/*
 * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* $FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $ */
/* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * @file
 * flowswitch IP reassembly for both v4 and v6
 *
 * Implementation of IP packet fragment reassembly.
 */

#include <sys/domain.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>

#define IPFM_MAX_FRAGS_PER_QUEUE 128    /* RFC 791 64K/(512 min MTU) */
#define IPFM_MAX_QUEUES 1024            /* same as ip/ip6 */
#define IPFM_FRAG_TTL 60                /* seconds (RFC 2460) */
#define IPFM_TIMEOUT_TCALL_INTERVAL 1   /* seconds */

static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;

#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
    ipfm_max_frags_per_queue, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipfm_max_frags_per_queue, 0, "");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_frag_ttl, 0, "");
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
    ipfm_timeout_tcall_ival, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipfm_timeout_tcall_ival, 0, "");

static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, 0, 0);

/* @internal ip fragment wrapper (chained in an ipfq) for __kern_packet */
struct ipf {
	struct ipf *ipf_down;
	struct ipf *ipf_up;
	struct __kern_packet *ipf_pkt;
	int ipf_len;            /* fragmentable part length */
	int ipf_off;            /* fragment offset */
	uint16_t ipf_mff;       /* more fragment bit in frag off */
};

/* @internal ip fragment lookup key */
struct ipf_key {
	uint64_t ipfk_addr[4];  /* src + dst ip addr (v4/v6) */
	uint32_t ipfk_ident;    /* IP identification */
	uint16_t ipfk_len;      /* len of ipfk_addr field */
};

enum {
	IPFK_LEN_V4 = 2 * sizeof(struct in_addr),
	IPFK_LEN_V6 = 2 * sizeof(struct in6_addr),
};
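
/*
 * Illustrative sketch (not in the original source): how a lookup key is
 * laid out per address family.  ipfk_addr is sized for the larger v6 case
 * and ipfk_len records how many bytes of it are significant:
 *
 *   v4: ipfk_addr[0]    = ip_src + ip_dst (8 bytes, copied contiguously)
 *       ipfk_len        = IPFK_LEN_V4 (8)
 *       ipfk_ident      = 16-bit ip_id, zero-extended
 *
 *   v6: ipfk_addr[0..3] = ip6_src + ip6_dst (32 bytes)
 *       ipfk_len        = IPFK_LEN_V6 (32)
 *       ipfk_ident      = 32-bit ip6f_ident
 */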

/*
 * @internal
 * IP reassembly queue structure.  Each fragment (struct ipf)
 * being reassembled is attached to one of these structures.
 */
struct ipfq {
	struct ipf *ipfq_down;          /* fragment chain */
	struct ipf *ipfq_up;
	struct ipfq *ipfq_next;         /* queue chain */
	struct ipfq *ipfq_prev;
	uint64_t ipfq_timestamp;        /* time of creation */
	struct ipf_key ipfq_key;        /* ipfq search key */
	uint16_t ipfq_nfrag;            /* # of fragments in queue */
	uint16_t ipfq_unfraglen;        /* len of unfragmentable part */
	bool ipfq_is_dirty;             /* q is dirty, don't use */
};
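
/*
 * Illustrative sketch (not in the original source): the reassembly state
 * is a two-dimensional circular doubly-linked structure.  Queues hang off
 * the manager's ipfm_q sentinel via ipfq_next/ipfq_prev; each queue's
 * fragments hang off it via ipfq_down/ipfq_up, sorted by fragment offset:
 *
 *   ipfm_q <-> ipfq A <-> ipfq B <-> (back to ipfm_q)
 *                 |           |
 *              ipf a1      ipf b1
 *                 |           |
 *              ipf a2      ipf b2
 *                 |           |
 *           (back to A)  (back to B)
 *
 * The (struct ipf *)q casts below rely on ipfq_down/ipfq_up sitting at the
 * same offsets as ipf_down/ipf_up, so a queue head doubles as the sentinel
 * of its own fragment list.
 */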

/*
 * @internal (externally opaque)
 * flowswitch IP Fragment Manager
 */
struct fsw_ip_frag_mgr {
	struct skoid ipfm_skoid;
	struct ipfq ipfm_q;             /* ip reassembly queues */
	uint32_t ipfm_q_limit;          /* limit # of reass queues */
	uint32_t ipfm_q_count;          /* # of allocated reass queues */
	uint32_t ipfm_f_limit;          /* limit # of ipfs */
	uint32_t ipfm_f_count;          /* current # of allocated ipfs */
	decl_lck_mtx_data(, ipfm_lock); /* guard reass and timeout cleanup */
	thread_call_t ipfm_timeout_tcall; /* frag timeout thread */

	struct ifnet *ipfm_ifp;
	struct fsw_stats *ipfm_stats;   /* indirect stats in fsw */
};

static int ipf_process(struct fsw_ip_frag_mgr *, struct __kern_packet **,
    struct ipf_key *, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t *,
    uint16_t *);
static int ipf_key_cmp(struct ipf_key *, struct ipf_key *);
static void ipf_enq(struct ipf *, struct ipf *);
static void ipf_deq(struct ipf *);
static void ipfq_insque(struct ipfq *, struct ipfq *);
static void ipfq_remque(struct ipfq *);
static uint32_t ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *,
    void (*)(struct fsw_ip_frag_mgr *, struct ipf *));

static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);

static struct ipfq *ipfq_alloc(struct fsw_ip_frag_mgr *mgr, int how);
static void ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q);
static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *));
static struct ipf *ipf_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f);
static void ipf_free_pkt(struct ipf *f);
static void ipfq_drain(struct fsw_ip_frag_mgr *);
static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;
void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt,
    int param);
void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f);

/* Create a flowswitch IP fragment manager. */
struct fsw_ip_frag_mgr *
fsw_ip_frag_mgr_create(struct nx_flowswitch *fsw, struct ifnet *ifp,
    size_t f_limit)
{
	struct fsw_ip_frag_mgr *mgr;

	/* ipf/ipfq use mbufs for IP fragment queue structures */
	_CASSERT(sizeof(struct ipfq) <= _MLEN);
	_CASSERT(sizeof(struct ipf) <= _MLEN);

	ASSERT(ifp != NULL);

	mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK | Z_NOFAIL,
	    skmem_tag_fsw_frag_mgr);

	mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
	lck_mtx_init(&mgr->ipfm_lock, &fsw_ipfm_lock_group, &fsw_ipfm_lock_attr);

	mgr->ipfm_timeout_tcall =
	    thread_call_allocate_with_options(ipfq_timeout, mgr,
	    THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	VERIFY(mgr->ipfm_timeout_tcall != NULL);

	mgr->ipfm_ifp = ifp;
	mgr->ipfm_stats = &fsw->fsw_stats;

	/* Use the caller-provided limit (the caller knows the pool size) */
	ASSERT(f_limit >= 2 && f_limit < UINT32_MAX);
	mgr->ipfm_f_limit = (uint32_t)f_limit;
	mgr->ipfm_f_count = 0;
	mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / 2);
	mgr->ipfm_q_count = 0;

	skoid_create(&mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid), "ipfm", 0);
	skoid_add_uint(&mgr->ipfm_skoid, "frag_limit", CTLFLAG_RW,
	    &mgr->ipfm_f_limit);
	skoid_add_uint(&mgr->ipfm_skoid, "frag_count", CTLFLAG_RD,
	    &mgr->ipfm_f_count);
	skoid_add_uint(&mgr->ipfm_skoid, "queue_limit", CTLFLAG_RW,
	    &mgr->ipfm_q_limit);
	skoid_add_uint(&mgr->ipfm_skoid, "queue_count", CTLFLAG_RD,
	    &mgr->ipfm_q_count);
	skoid_add_handler(&mgr->ipfm_skoid, "drain", CTLFLAG_RW,
	    ipfq_drain_sysctl, mgr, 0);

	return mgr;
}
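
/*
 * Worked example (illustrative): with a caller-provided f_limit of 1024,
 * ipfm_q_limit = MIN(IPFM_MAX_QUEUES, 1024 / 2) = MIN(1024, 512) = 512,
 * i.e. the queue limit is sized so that, at the limit, each reassembly
 * queue can still average at least two fragments.
 */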

/* Free a flowswitch IP fragment manager. */
void
fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
{
	thread_call_t tcall;

	lck_mtx_lock(&mgr->ipfm_lock);
	if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
		lck_mtx_unlock(&mgr->ipfm_lock);
		(void) thread_call_cancel_wait(tcall);
		(void) thread_call_free(tcall);
		mgr->ipfm_timeout_tcall = NULL;
		lck_mtx_lock(&mgr->ipfm_lock);
	}

	ipfq_drain(mgr);

	lck_mtx_unlock(&mgr->ipfm_lock);
	lck_mtx_destroy(&mgr->ipfm_lock, &fsw_ipfm_lock_group);

	skoid_destroy(&mgr->ipfm_skoid);
	sk_free_type(struct fsw_ip_frag_mgr, mgr);
}

/*
 * Reassemble a received IPv4 fragment.
 *
 * @param mgr
 *   fragment manager
 * @param pkt
 *   received packet (must have its IPv4 header validated)
 * @param ip4
 *   pointer to the packet's IPv4 header
 * @param nfrags
 *   number of fragments reassembled
 * @param tlen
 *   incremented by the fragmentable-part length of each chained fragment
 *   beyond the first
 * @return
 *   Successfully processed (not fully reassembled)
 *     ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
 *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment (fragments chained in order by
 *     pkt_nextpkt), *nfrags = number of all fragments (> 0)
 *   Error
 *     ret != 0 && *pkt unmodified (caller decides what to do with *pkt),
 *     *nfrags = 0
 */
int
fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip *ip4, uint16_t *nfrags, uint16_t *tlen)
{
	struct ipf_key key;
	uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
	int err;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);

	bcopy((void *)&ip4->ip_src, (void *)key.ipfk_addr, IPFK_LEN_V4);
	key.ipfk_len = IPFK_LEN_V4;
	key.ipfk_ident = (uint32_t)ip4->ip_id;

	unfragpartlen = (uint16_t)(ip4->ip_hl << 2);
	offflag = ntohs(ip4->ip_off);
	/* flag bits (IP_RF/IP_DF/IP_MF) shift out of the 16-bit result */
	fragoff = (uint16_t)(offflag << 3);
	fragpartlen = ntohs(ip4->ip_len) - (uint16_t)(ip4->ip_hl << 2);
	fragflag = offflag & IP_MF;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
	    fragflag, nfrags, tlen);

	/*
	 * If the packet has been reassembled, compute the user data length.
	 */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip *iph = (struct ip *)p->pkt_flow_ip_hdr;

		p->pkt_flow_ulen = ntohs(iph->ip_len) -
		    p->pkt_flow_ip_hlen - p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}
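
/*
 * Worked example (illustrative): a 4000-byte v4 payload sent over a
 * 1500-MTU link arrives as three fragments sharing one ip_id:
 *
 *   frag 1: ip_off = IP_MF|0,   fragoff = 0,    fragpartlen = 1480
 *   frag 2: ip_off = IP_MF|185, fragoff = 1480, fragpartlen = 1480
 *   frag 3: ip_off = 370,       fragoff = 2960, fragpartlen = 1040
 *
 * ip_off carries the offset in 8-byte units in its low 13 bits, hence the
 * "<< 3" above; the IP_RF/IP_DF/IP_MF flag bits shift out of the 16-bit
 * result, so no explicit IP_OFFMASK masking is needed.
 */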

/*
 * Reassemble a received IPv6 fragment.
 *
 * @param mgr
 *   fragment manager
 * @param pkt
 *   received packet (must have its IPv6 header validated)
 * @param ip6
 *   pointer to the packet's IPv6 header
 * @param ip6f
 *   pointer to the packet's IPv6 Fragment Header
 * @param nfrags
 *   number of fragments reassembled
 * @param tlen
 *   incremented by the fragmentable-part length of each chained fragment
 *   beyond the first
 * @return
 *   Successfully processed (not fully reassembled)
 *     ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
 *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment (fragments chained in order by
 *     pkt_nextpkt), *nfrags = number of all fragments (> 0)
 *   Error
 *     ret != 0 && *pkt unmodified (caller decides what to do with *pkt),
 *     *nfrags = 0
 */
int
fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip6_hdr *ip6, struct ip6_frag *ip6f, uint16_t *nfrags,
    uint16_t *tlen)
{
	struct ipf_key key;
	ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
	uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
	int err;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);

	/* a jumbo payload can't contain a fragment header */
	if (ip6->ip6_plen == 0) {
		*nfrags = 0;
		return ERANGE;
	}

	ASSERT(ip6f_ptroff < UINT16_MAX);
	ip6f_off = (uint16_t)ip6f_ptroff;
	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
	fragpartlen = ntohs(ip6->ip6_plen) -
	    (ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
	unfragpartlen = ip6f_off;
	fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;

	/*
	 * RFC 6946: Handle "atomic" fragments (offset and M bit both set
	 * to 0) upfront, unrelated to any reassembly.
	 *
	 * The flow classifier should process those as non-fragments;
	 * ipfm shouldn't see them.
	 */
	ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != 0);

	bcopy((void *)&ip6->ip6_src, (void *)key.ipfk_addr, IPFK_LEN_V6);
	key.ipfk_len = IPFK_LEN_V6;
	key.ipfk_ident = ip6f->ip6f_ident;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
	    fragflag, nfrags, tlen);

	/*
	 * If the packet has been reassembled, compute the user data length.
	 */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip6_hdr *ip6h = (struct ip6_hdr *)p->pkt_flow_ip_hdr;

		p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
		    p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}
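
/*
 * Worked example (illustrative): for a fragment whose header chain is just
 * IPv6 header (40 bytes) + Fragment header (8) carrying 1448 bytes of data:
 *
 *   ip6f_off      = 40 (fragment header right after the IPv6 header)
 *   unfragpartlen = 40
 *   ip6_plen      = 8 + 1448 = 1456
 *   fragpartlen   = 1456 - (40 + 8 - 40) = 1448
 *
 * i.e. fragpartlen counts only the data after the fragment header; any
 * unfragmentable extension headers before it grow ip6f_off and
 * unfragpartlen by the same amount.
 */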

static struct mbuf *
ipf_pkt2mbuf(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt)
{
	unsigned int one = 1;
	struct mbuf *m = NULL;
	uint8_t *buf;
	struct ip6_hdr *ip6;
	uint32_t l3t_len;
	int err;

	l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
	if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
		l3t_len -= ETHER_CRC_LEN;
	}

	err = mbuf_allocpacket(MBUF_WAITOK, l3t_len, &one, &m);
	VERIFY(err == 0);
	ASSERT(l3t_len <= mbuf_maxlen(m));

	if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
		bcopy(pkt->pkt_mbuf->m_data + pkt->pkt_l2_len,
		    m->m_data, l3t_len);
	} else {
		MD_BUFLET_ADDR_ABS(pkt, buf);
		buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
		bcopy(buf, m->m_data, l3t_len);
	}
	m->m_pkthdr.len = m->m_len = l3t_len;

	ip6 = mtod(m, struct ip6_hdr *);
	/* note on the casts: the IN6_IS_SCOPE_* macros don't need alignment */
	if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
		if (in6_embedded_scope) {
			ip6->ip6_src.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}
	if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
		if (in6_embedded_scope) {
			ip6->ip6_dst.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}

	return m;
}

/*
 * Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
 * we need to ensure we don't enter the driver directly because a deadlock
 * can happen if this same thread tries to get the workloop lock.
 */
static void
ipf_icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
{
	sk_protect_t protect = sk_async_transmit_protect();
	icmp6_error_flag(m, type, code, param, flags);
	sk_async_transmit_unprotect(protect);
}

/*
 * @internal IP fragment ICMP parameter problem error handling
 *
 * @param param_offset
 *   offending parameter offset, only applicable to ICMPv6
 */
void
ipf_icmp_param_err(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt,
    int param_offset)
{
	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
		return;
	}

	struct mbuf *m = NULL;
	m = ipf_pkt2mbuf(mgr, pkt);
	if (__probable(m != NULL)) {
		ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
		    param_offset, 0);
	}

	/* m is freed by icmp6_error_flag() */
}

/* @internal IP fragment ICMP timeout error handling */
void
ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
	struct __kern_packet *pkt = f->ipf_pkt;
	ASSERT(pkt != NULL);

	/* no ICMP error packet for IPv4 */
	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
		return;
	}

	/* only for the first fragment */
	if (f->ipf_off != 0) {
		return;
	}

	struct mbuf *m = NULL;
	m = ipf_pkt2mbuf(mgr, pkt);
	if (__probable(m != NULL)) {
		ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
		    ICMP6_TIME_EXCEED_REASSEMBLY, 0, 0);
	}

	/* m is freed by icmp6_error_flag() */
}

/* @internal IP fragment processing, v4/v6 agnostic */
int
ipf_process(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt_ptr,
    struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
    uint16_t fragpartlen, uint16_t fragflag, uint16_t *nfrags, uint16_t *tlen)
{
	struct __kern_packet *pkt = *pkt_ptr;
	struct __kern_packet *pkt_reassed = NULL;
	struct ipfq *q, *mq = &mgr->ipfm_q;
	struct ipf *f, *f_new, *f_down;
	uint32_t nfrags_freed;
	int next;
	int first_frag = 0;
	int err = 0;

	*nfrags = 0;

	SK_DF(SK_VERB_IP_FRAG, "id %5d fragoff %5d fragpartlen %5d "
	    "fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);

	/*
	 * Make sure that all fragments except the last one have a data
	 * length that's a non-zero multiple of 8 bytes.
	 */
	if (fragflag && (fragpartlen == 0 || (fragpartlen & 0x7) != 0)) {
		SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
		ipf_icmp_param_err(mgr, pkt,
		    offsetof(struct ip6_hdr, ip6_plen));
		return ERANGE;
	}
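
	/*
	 * Worked example (illustrative): a v4 middle fragment from a
	 * 1500-MTU link carries 1480 data bytes (185 * 8) and passes
	 * this check; a middle fragment carrying, say, 1461 bytes
	 * (1461 & 0x7 == 5) is rejected, since every fragment except
	 * the last must be a non-zero multiple of 8 bytes.
	 */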

	lck_mtx_lock(&mgr->ipfm_lock);

	/* find the ipfq */
	for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
		if (ipf_key_cmp(key, &q->ipfq_key) == 0) {
			if (q->ipfq_is_dirty) {
				SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
				err = EINVAL;
				goto done;
			}
			break;
		}
	}

	/* not found, create a new ipfq */
	if (q == mq) {
		first_frag = 1;

		q = ipfq_alloc(mgr, M_DONTWAIT);
		if (q == NULL) {
			STATS_INC(mgr->ipfm_stats,
			    FSW_STATS_RX_FRAG_DROP_NOMEM);
			err = ENOMEM;
			goto done;
		}

		ipfq_insque(q, mq);
		net_update_uptime();

		bcopy(key, &q->ipfq_key, sizeof(struct ipf_key));
		q->ipfq_down = q->ipfq_up = (struct ipf *)q;
		q->ipfq_unfraglen = 0;
		q->ipfq_nfrag = 0;
		q->ipfq_timestamp = _net_uptime;
	}

	ASSERT(!q->ipfq_is_dirty);

	/* this queue has reached the per-queue frag limit */
	if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
		nfrags_freed = ipfq_freefq(mgr, q, NULL);
		STATS_ADD(mgr->ipfm_stats,
		    FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
		err = ENOMEM;
		goto done;
	}

	/*
	 * If it's the 1st fragment, record the length of the
	 * unfragmentable part.
	 */
	if (fragoff == 0) {
		q->ipfq_unfraglen = unfraglen;
	}

	/* Check that the reassembled packet would not exceed 65535 bytes. */
	if (q->ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
		SK_DF(SK_VERB_IP_FRAG, "frag too big");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		ipf_icmp_param_err(mgr, pkt, sizeof(struct ip6_hdr) +
		    offsetof(struct ip6_frag, ip6f_offlg));
		err = ERANGE;
		goto done;
	}

	/*
	 * If it's the 1st fragment, do the above check for each
	 * fragment already stored in the reassembly queue.
	 * If an error is found, still return 0, since we don't hand
	 * ownership of a chain of offending packets back to the caller.
	 */
	if (fragoff == 0) {
		for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
			f_down = f->ipf_down;
			if (q->ipfq_unfraglen + f->ipf_off + f->ipf_len >
			    IP_MAXPACKET) {
				SK_DF(SK_VERB_IP_FRAG, "frag too big");
				STATS_INC(mgr->ipfm_stats,
				    FSW_STATS_RX_FRAG_BAD);
				ipf_deq(f);
				ipf_free_pkt(f);
				ipf_free(mgr, f);
			}
		}
	}

	f_new = ipf_alloc(mgr);
	if (f_new == NULL) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
		err = ENOMEM;
		goto done;
	}

	f_new->ipf_mff = fragflag;
	f_new->ipf_off = fragoff;
	f_new->ipf_len = fragpartlen;
	f_new->ipf_pkt = pkt;

	if (first_frag) {
		f = (struct ipf *)q;
		goto insert;
	}

	/* Find a segment which begins after this one does. */
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		if (f->ipf_off > f_new->ipf_off) {
			break;
		}
	}

	/*
	 * If any of the fragments being reassembled overlap with any
	 * other fragments being reassembled for the same packet,
	 * reassembly of that packet must be abandoned and all the
	 * fragments that have been received for that packet must be
	 * discarded, and no ICMP error messages should be sent.
	 *
	 * It should be noted that fragments may be duplicated in the
	 * network.  Instead of treating these exact duplicate fragments
	 * as overlapping fragments, an implementation may choose to
	 * detect this case and drop exact duplicate fragments while
	 * keeping the other fragments belonging to the same packet.
	 *
	 * https://tools.ietf.org/html/rfc8200#appendix-B
	 *
	 * We apply this rule to both IPv4 and IPv6 here.
	 */
	if (((f->ipf_up != (struct ipf *)q) && /* prev frag spans into f_new */
	    (f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > 0)) ||
	    ((f != (struct ipf *)q) && /* f_new spans into next */
	    (f_new->ipf_off + f_new->ipf_len - f->ipf_off > 0))) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		/* Check for an exact duplicate offset/length */
		if (((f->ipf_up != (struct ipf *)q) &&
		    ((f->ipf_up->ipf_off != f_new->ipf_off) ||
		    (f->ipf_up->ipf_len != f_new->ipf_len))) ||
		    ((f != (struct ipf *)q) &&
		    ((f->ipf_off != f_new->ipf_off) ||
		    (f->ipf_len != f_new->ipf_len)))) {
			SK_DF(SK_VERB_IP_FRAG, "frag overlap");
			ipf_free(mgr, f_new);
			/* give up the overlapping fragment queue */
			SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
			ipfq_freef(mgr, q, NULL);
			q->ipfq_is_dirty = true;
		} else {
			ipf_free(mgr, f_new);
			SK_DF(SK_VERB_IP_FRAG, "frag dup");
		}
		err = ERANGE;
		goto done;
	}
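
	/*
	 * Worked example (illustrative): with an existing fragment at
	 * off 0, len 1480, a new fragment at off 1472 overlaps, since
	 * 0 + 1480 - 1472 = 8 > 0; the whole queue is torn down and
	 * marked dirty.  A new fragment at exactly off 0, len 1480 is
	 * an exact duplicate, and only the duplicate itself is dropped.
	 */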

insert:
	/*
	 * Stick the new segment in its place;
	 * check for complete reassembly.
	 * Move to front of packet queue, as we are
	 * the most recently active fragmented packet.
	 */
	ipf_enq(f_new, f->ipf_up);
	q->ipfq_nfrag++;
	next = 0;
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		/* there is a hole */
		if (f->ipf_off != next) {
			goto done;
		}
		next += f->ipf_len;
	}
	/* we haven't got the last frag yet */
	if (f->ipf_up->ipf_mff) {
		goto done;
	}
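
	/*
	 * Illustrative: if fragments at offsets 0 (len 1480) and 2960
	 * are queued but offset 1480 is still missing, the walk above
	 * stops at 2960 != 1480 and we return with the queue intact;
	 * likewise we keep waiting while the highest-offset fragment
	 * still has its more-fragments bit set.
	 */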

	/*
	 * Reassembly is complete; concatenate the fragments.
	 */
	f = q->ipfq_down;
	f_down = f->ipf_down;
	pkt_reassed = f->ipf_pkt;
	*nfrags = 1;
	while (f_down != (struct ipf *)q) {
		/* chain the __kern_packets via the pkt_nextpkt ptr */
		f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
		(*nfrags)++;
		(*tlen) += f_down->ipf_len;
		f_down = f->ipf_down;
		ipf_deq(f);
		ipf_free(mgr, f);
		f = f_down;
		f_down = f->ipf_down;
	}
	ipf_free(mgr, f);

	err = 0;
	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
	ipfq_remque(q);
	ipfq_free(mgr, q);

done:
	/*
	 * On success, ipfm either takes ownership of the fragment or
	 * returns the assembled packet chain.
	 */
	if (err == 0) {
		/* reass'ed packet if done; NULL otherwise */
		*pkt_ptr = pkt_reassed;
	}
	ipfq_sched_timeout(mgr, FALSE);
	lck_mtx_unlock(&mgr->ipfm_lock);
	return err;
}

static int
ipf_key_cmp(struct ipf_key *a, struct ipf_key *b)
{
	int d;

	if ((d = (a->ipfk_len - b->ipfk_len)) != 0) {
		return d;
	}

	if ((d = (a->ipfk_ident - b->ipfk_ident)) != 0) {
		return d;
	}

	return memcmp(a->ipfk_addr, b->ipfk_addr, a->ipfk_len);
}
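
/*
 * Illustrative note: the cheap fixed-width fields are compared first, so
 * the memcmp() only ever runs for keys of the same address family (and
 * reading a->ipfk_len bytes is then valid for both keys); a v4 key (len 8)
 * and a v6 key (len 32) can never match even if their identification
 * fields happen to be equal.
 */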

/*
 * Put an ip fragment on a reassembly chain.
 * Like insque, but pointers in middle of structure.
 */
static void
ipf_enq(struct ipf *f, struct ipf *up6)
{
	f->ipf_up = up6;
	f->ipf_down = up6->ipf_down;
	up6->ipf_down->ipf_up = f;
	up6->ipf_down = f;
}
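
/*
 * Illustrative sketch of ipf_enq(f, up6): the new fragment f is spliced in
 * directly below up6, i.e. at the next-higher-offset position:
 *
 *   before:  up6 <-> x        after:  up6 <-> f <-> x
 *
 * where "<->" stands for the ipf_down/ipf_up pointer pair.
 */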

/*
 * To ipf_enq as remque is to insque.
 */
static void
ipf_deq(struct ipf *f)
{
	f->ipf_up->ipf_down = f->ipf_down;
	f->ipf_down->ipf_up = f->ipf_up;
}

static void
ipfq_insque(struct ipfq *new, struct ipfq *old)
{
	new->ipfq_prev = old;
	new->ipfq_next = old->ipfq_next;
	old->ipfq_next->ipfq_prev = new;
	old->ipfq_next = new;
}

static void
ipfq_remque(struct ipfq *p6)
{
	p6->ipfq_prev->ipfq_next = p6->ipfq_next;
	p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
}

static void
ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct fsw_ip_frag_mgr *mgr = arg0;
	struct ipfq *q;
	uint64_t now, elapsed;
	uint32_t nfreed = 0;

	net_update_uptime();
	now = _net_uptime;

	SK_DF(SK_VERB_IP_FRAG, "run");
	lck_mtx_lock(&mgr->ipfm_lock);
	q = mgr->ipfm_q.ipfq_next;
	if (q) {
		while (q != &mgr->ipfm_q) {
			q = q->ipfq_next;
			elapsed = now - q->ipfq_prev->ipfq_timestamp;
			if (elapsed > ipfm_frag_ttl) {
				SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
				    q->ipfq_prev->ipfq_key.ipfk_ident);
				nfreed += ipfq_freefq(mgr, q->ipfq_prev,
				    q->ipfq_prev->ipfq_is_dirty ? NULL :
				    ipf_icmp_timeout_err);
			}
		}
	}

	/* If running out of resources, drain ipfm queues (oldest one first) */
	if (mgr->ipfm_f_count >= mgr->ipfm_f_limit ||
	    mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
		SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
		    mgr->ipfm_f_count, mgr->ipfm_f_limit, mgr->ipfm_q_count,
		    mgr->ipfm_q_limit);
		uint32_t target_q_count = mgr->ipfm_q_count / 2;
		while (mgr->ipfm_q_count > target_q_count) {
			struct ipfq *oldest = mgr->ipfm_q.ipfq_prev;
			nfreed += ipfq_freefq(mgr, oldest,
			    oldest->ipfq_is_dirty ? NULL :
			    ipf_icmp_timeout_err);
		}
	}

	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, nfreed);

	/* re-arm the purge timer if there's work to do */
	if (mgr->ipfm_q_count > 0) {
		ipfq_sched_timeout(mgr, TRUE);
	}
	lck_mtx_unlock(&mgr->ipfm_lock);
}

static void
ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
{
	uint32_t delay = MAX(1, ipfm_timeout_tcall_ival); /* seconds */
	thread_call_t tcall = mgr->ipfm_timeout_tcall;
	uint64_t now = mach_absolute_time();
	uint64_t ival, deadline = now;

	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);

	ASSERT(tcall != NULL);
	if (mgr->ipfm_q_count > 0 &&
	    (!thread_call_isactive(tcall) || in_tcall)) {
		nanoseconds_to_absolutetime(delay * NSEC_PER_SEC, &ival);
		clock_deadline_for_periodic_event(ival, now, &deadline);
		(void) thread_call_enter_delayed(tcall, deadline);
	}
}

/*
 * @internal drain all reassembly queues for shutdown.
 *
 * @discussion Shutdown is called when if_detach happens, so no time-exceeded
 * ICMP errors are generated here.
 */
static void
ipfq_drain(struct fsw_ip_frag_mgr *mgr)
{
	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
	while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q) {
		ipfq_freefq(mgr, mgr->ipfm_q.ipfq_next, NULL);
	}
}

static int
ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct fsw_ip_frag_mgr *mgr = arg1;

	SKOID_PROC_CALL_GUARD;

	lck_mtx_lock(&mgr->ipfm_lock);
	ipfq_drain(mgr);
	lck_mtx_unlock(&mgr->ipfm_lock);

	return 0;
}

static struct ipfq *
ipfq_alloc(struct fsw_ip_frag_mgr *mgr, int how)
{
	struct mbuf *t;
	struct ipfq *q;

	if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_QUEUE_LIMIT);
		return NULL;
	}

	t = m_get(how, MT_FTABLE);
	if (t != NULL) {
		mgr->ipfm_q_count++;
		q = mtod(t, struct ipfq *);
		bzero(q, sizeof(*q));
		q->ipfq_is_dirty = false;
	} else {
		q = NULL;
	}
	return q;
}

/* free q */
static void
ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q)
{
	(void) m_free(dtom(q));
	mgr->ipfm_q_count--;
}

/*
 * Free all fragments, keep q.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	struct ipf *f, *down6;
	uint32_t nfrags = 0;

	for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
		nfrags++;
		down6 = f->ipf_down;
		ipf_deq(f);
		if (ipf_cb != NULL) {
			(*ipf_cb)(mgr, f);
		}
		ipf_free_pkt(f);
		ipf_free(mgr, f);
	}

	return nfrags;
}

/*
 * Free all fragments as well as q itself.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	uint32_t freed_count;
	freed_count = ipfq_freef(mgr, q, ipf_cb);
	ipfq_remque(q);
	ipfq_free(mgr, q);
	return freed_count;
}

static struct ipf *
ipf_alloc(struct fsw_ip_frag_mgr *mgr)
{
	struct mbuf *t;
	struct ipf *f;

	if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
		return NULL;
	}

	t = m_get(M_DONTWAIT, MT_FTABLE);
	if (t != NULL) {
		mgr->ipfm_f_count++;
		f = mtod(t, struct ipf *);
		bzero(f, sizeof(*f));
	} else {
		f = NULL;
	}
	return f;
}

static void
ipf_free_pkt(struct ipf *f)
{
	struct __kern_packet *pkt = f->ipf_pkt;
	ASSERT(pkt != NULL);
	pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
	    SK_PTR_ADDR(pkt));
}

static void
ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
	(void) m_free(dtom(f));
	mgr->ipfm_f_count--;
}
1009