1 /*
2 * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /* $FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $ */
30 /* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */
31
32 /*
33 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the project nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 */
60
61 /*
62 * @file
63 * flowswitch IP Reassembly for both v4 and v6
64 *
65 * Implementation of IP packet fragmentation and reassembly.
66 *
67 */
68
69 #include <sys/domain.h>
70 #include <netinet/in.h>
71 #include <netinet/ip6.h>
72 #include <netinet/icmp6.h>
73 #include <skywalk/os_skywalk_private.h>
74 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
75 #include <skywalk/nexus/flowswitch/fsw_var.h>
76
/* Tunable defaults; runtime values live in the statics below. */
#define IPFM_MAX_FRAGS_PER_QUEUE 128 /* RFC 791 64K/(512 min MTU) */
#define IPFM_MAX_QUEUES 1024 /* same as ip/ip6 */
#define IPFM_FRAG_TTL 60 /* RFC 2460 */
#define IPFM_TIMEOUT_TCALL_INTERVAL 1 /* timeout thread-call period, seconds */

/* runtime tunables (exported via sysctl below) */
static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;

/* per-queue fragment limit is only tunable on DEVELOPMENT/DEBUG kernels */
#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
    ipfm_max_frags_per_queue, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipfm_max_frags_per_queue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */

SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_frag_ttl, 0, "");
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
    ipfm_timeout_tcall_ival, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipfm_timeout_tcall_ival, 0, "");

/* lock group/attr shared by all fragment-manager mutexes */
static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, 0, 0);
100
/* @internal ip fragment wrapper (chained in an ipfq) for __kern_packet */
struct ipf {
    struct ipf *ipf_down;           /* next fragment (higher offset) */
    struct ipf *ipf_up;             /* previous fragment (lower offset) */
    struct __kern_packet *ipf_pkt;  /* wrapped packet, owned while queued */
    int ipf_len;                    /* fragmentable part length */
    int ipf_off;                    /* fragment offset */
    uint16_t ipf_mff;               /* more fragment bit in frag off */
};

/* @internal ip fragment lookup key */
struct ipf_key {
    uint64_t ipfk_addr[4];          /* src + dst ip addr (v4/v6) */
    uint32_t ipfk_ident;            /* IP identification */
    uint16_t ipfk_len;              /* len of ipfk_addr field */
};

/* valid ipfk_len values: bytes of ipfk_addr actually in use */
enum {
    IPFK_LEN_V4 = 2 * sizeof(struct in_addr),
    IPFK_LEN_V6 = 2 * sizeof(struct in6_addr),
};
122
123 /*
124 * @internal
125 * IP reassembly queue structure. Each fragment (struct ipf)
126 * being reassembled is attached to one of these structures.
127 */
128 struct ipfq {
129 struct ipf *ipfq_down; /* fragment chain */
130 struct ipf *ipfq_up;
131 struct ipfq *ipfq_next; /* queue chain */
132 struct ipfq *ipfq_prev;
133 uint64_t ipfq_timestamp; /* time of creation */
134 struct ipf_key ipfq_key; /* ipfq search key */
135 uint16_t ipfq_nfrag; /* # of fragments in queue */
136 int ipfq_unfraglen; /* len of unfragmentable part */
137 bool ipfq_is_dirty; /* q is dirty, don't use */
138 };
139
140 /*
141 * @internal (externally opaque)
142 * flowswitch IP Fragment Manager
143 */
144 struct fsw_ip_frag_mgr {
145 struct skoid ipfm_skoid;
146 struct ipfq ipfm_q; /* ip reassembly queues */
147 uint32_t ipfm_q_limit; /* limit # of reass queues */
148 uint32_t ipfm_q_count; /* # of allocated reass queues */
149 uint32_t ipfm_f_limit; /* limit # of ipfs */
150 uint32_t ipfm_f_count; /* current # of allocated ipfs */
151 decl_lck_mtx_data(, ipfm_lock); /* guard reass and timeout cleanup */
152 thread_call_t ipfm_timeout_tcall; /* frag timeout thread */
153
154 struct ifnet *ipfm_ifp;
155 struct fsw_stats *ipfm_stats; /* indirect stats in fsw */
156 };
157
/* forward declarations for the internal reassembly machinery below */
static int ipf_process(struct fsw_ip_frag_mgr *, struct __kern_packet **,
    struct ipf_key *, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t *,
    uint16_t *);
static int ipf_key_cmp(struct ipf_key *, struct ipf_key *);
static void ipf_enq(struct ipf *, struct ipf *);
static void ipf_deq(struct ipf *);
static void ipfq_insque(struct ipfq *, struct ipfq *);
static void ipfq_remque(struct ipfq *);
static uint32_t ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *,
    void (*)(struct fsw_ip_frag_mgr *, struct ipf *));

static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);

static struct ipfq *ipfq_alloc(struct fsw_ip_frag_mgr *mgr, int how);
static void ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q);
static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *));
static struct ipf *ipf_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f);
static void ipf_free_pkt(struct ipf *f);
static void ipfq_drain(struct fsw_ip_frag_mgr *mgr);
static void ipfq_reap(struct fsw_ip_frag_mgr *mgr);
static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;
/* non-static: also used as ipf_cb callbacks passed to ipfq_freefq/ipfq_freef */
void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt,
    int param);
void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f);
185
186 /* Create a flowswitch IP fragment manager. */
187 struct fsw_ip_frag_mgr *
fsw_ip_frag_mgr_create(struct nx_flowswitch * fsw,struct ifnet * ifp,size_t f_limit)188 fsw_ip_frag_mgr_create(struct nx_flowswitch *fsw, struct ifnet *ifp,
189 size_t f_limit)
190 {
191 struct fsw_ip_frag_mgr *mgr;
192
193 /* ipf/ipfq uses mbufs for IP fragment queue structures */
194 _CASSERT(sizeof(struct ipfq) <= _MLEN);
195 _CASSERT(sizeof(struct ipf) <= _MLEN);
196
197 ASSERT(ifp != NULL);
198
199 mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK | Z_NOFAIL,
200 skmem_tag_fsw_frag_mgr);
201
202 mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
203 lck_mtx_init(&mgr->ipfm_lock, &fsw_ipfm_lock_group, &fsw_ipfm_lock_attr);
204
205 mgr->ipfm_timeout_tcall =
206 thread_call_allocate_with_options(ipfq_timeout, mgr,
207 THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
208 VERIFY(mgr->ipfm_timeout_tcall != NULL);
209
210 mgr->ipfm_ifp = ifp;
211 mgr->ipfm_stats = &fsw->fsw_stats;
212
213 /* Use caller provided limit (caller knows pool size) */
214 ASSERT(f_limit >= 2 && f_limit < UINT32_MAX);
215 mgr->ipfm_f_limit = (uint32_t)f_limit;
216 mgr->ipfm_f_count = 0;
217 mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / 2);
218 mgr->ipfm_q_count = 0;
219
220 skoid_create(&mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid), "ipfm", 0);
221 skoid_add_uint(&mgr->ipfm_skoid, "frag_limit", CTLFLAG_RW,
222 &mgr->ipfm_f_limit);
223 skoid_add_uint(&mgr->ipfm_skoid, "frag_count", CTLFLAG_RD,
224 &mgr->ipfm_f_count);
225 skoid_add_uint(&mgr->ipfm_skoid, "queue_limit", CTLFLAG_RW,
226 &mgr->ipfm_q_limit);
227 skoid_add_uint(&mgr->ipfm_skoid, "queue_count", CTLFLAG_RD,
228 &mgr->ipfm_q_count);
229 skoid_add_handler(&mgr->ipfm_skoid, "drain", CTLFLAG_RW,
230 ipfq_drain_sysctl, mgr, 0);
231
232 return mgr;
233 }
234
235 /* Free a flowswitch IP fragment manager. */
236 void
fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr * mgr)237 fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
238 {
239 thread_call_t tcall;
240
241 lck_mtx_lock(&mgr->ipfm_lock);
242 if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
243 lck_mtx_unlock(&mgr->ipfm_lock);
244 (void) thread_call_cancel_wait(tcall);
245 (void) thread_call_free(tcall);
246 mgr->ipfm_timeout_tcall = NULL;
247 lck_mtx_lock(&mgr->ipfm_lock);
248 }
249
250 ipfq_drain(mgr);
251
252 lck_mtx_unlock(&mgr->ipfm_lock);
253 lck_mtx_destroy(&mgr->ipfm_lock, &fsw_ipfm_lock_group);
254
255 skoid_destroy(&mgr->ipfm_skoid);
256 sk_free_type(struct fsw_ip_frag_mgr, mgr);
257 }
258
259 /*
260 * Reassemble a received IPv4 fragment.
261 *
262 * @param mgr
263 * fragment manager
264 * @param pkt
265 * received packet (must have ipv4 header validated)
266 * @param ip4
267 * pointer to the packet's IPv4 header
268 * @param nfrags
269 * number of fragments reassembled
270 * @return
271 * Successfully processed (not fully reassembled)
272 * ret = 0, *pkt = NULL(ipfm owns it), *nfrags=0
273 * Successfully reassembled
274 * ret = 0, *pkt = 1st fragment(fragments chained in order by pkt_nextpkt)
275 * *nfrags = number of all fragments (>0)
276 * Error
277 * ret != 0 && *pkt unmodified (caller to decide what to do with *pkt)
278 * *nfrags = 0
279 */
280 int
fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr * mgr,struct __kern_packet ** pkt,struct ip * ip4,uint16_t * nfrags,uint16_t * tlen)281 fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
282 struct ip *ip4, uint16_t *nfrags, uint16_t *tlen)
283 {
284 struct ipf_key key;
285 uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
286 int err;
287
288 STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);
289
290 bcopy((void *)&ip4->ip_src, (void *)key.ipfk_addr, IPFK_LEN_V4);
291 key.ipfk_len = IPFK_LEN_V4;
292 key.ipfk_ident = (uint32_t)ip4->ip_id;
293
294 unfragpartlen = (uint16_t)(ip4->ip_hl << 2);
295 offflag = ntohs(ip4->ip_off);
296 fragoff = (uint16_t)(offflag << 3);
297 fragpartlen = ntohs(ip4->ip_len) - (uint16_t)(ip4->ip_hl << 2);
298 fragflag = offflag & IP_MF;
299
300 err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
301 fragflag, nfrags, tlen);
302
303 /*
304 * If packet has been reassembled compute the user data length.
305 */
306 if (*pkt != NULL) {
307 struct __kern_packet *p = *pkt;
308 struct ip *iph = (struct ip *)p->pkt_flow_ip_hdr;
309
310 p->pkt_flow_ulen = ntohs(iph->ip_len) -
311 p->pkt_flow_ip_hlen - p->pkt_flow->flow_l4._l4_hlen;
312 }
313 return err;
314 }
315
316 /*
317 * Reassemble a received IPv6 fragment.
318 *
319 * @param mgr
320 * fragment manager
321 * @param pkt
322 * received packet (must have ipv6 header validated)
323 * @param ip6
324 * pointer to the packet's IPv6 header
325 * @param ip6f
326 * pointer to the packet's IPv6 Fragment Header
327 * @param nfrags
328 * number of fragments reassembled
329 * @return
330 * Successfully processed (not fully reassembled)
331 * ret = 0, *pkt = NULL(ipfm owns it), *nfrags=0
332 * Successfully reassembled
333 * ret = 0, *pkt = 1st fragment(fragments chained in ordrer by pkt_nextpkt)
334 * *nfrags = number of all fragments (>0)
335 * Error
336 * ret != 0 && *pkt unmodified (caller to decide what to do with *pkt)
337 * *nfrags = 0
338 */
339 int
fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr * mgr,struct __kern_packet ** pkt,struct ip6_hdr * ip6,struct ip6_frag * ip6f,uint16_t * nfrags,uint16_t * tlen)340 fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
341 struct ip6_hdr *ip6, struct ip6_frag *ip6f, uint16_t *nfrags,
342 uint16_t *tlen)
343 {
344 struct ipf_key key;
345 ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
346 uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
347 int err;
348
349 STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);
350
351 /* jumbo payload can't contain a fragment header */
352 if (ip6->ip6_plen == 0) {
353 *nfrags = 0;
354 return ERANGE;
355 }
356
357 ASSERT(ip6f_ptroff < UINT16_MAX);
358 ip6f_off = (uint16_t)ip6f_ptroff;
359 fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
360 fragpartlen = ntohs(ip6->ip6_plen) -
361 (ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
362 unfragpartlen = ip6f_off;
363 fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;
364
365 /*
366 * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0)
367 * upfront, unrelated to any reassembly.
368 *
369 * Flow classifier should process those as non-frag, ipfm shouldn't see
370 * them.
371 */
372 ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != 0);
373
374 bcopy((void *)&ip6->ip6_src, (void *)key.ipfk_addr, IPFK_LEN_V6);
375 key.ipfk_len = IPFK_LEN_V6;
376 key.ipfk_ident = ip6f->ip6f_ident;
377
378 err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
379 fragflag, nfrags, tlen);
380
381 /*
382 * If packet has been reassembled compute the user data length.
383 */
384 if (*pkt != NULL) {
385 struct __kern_packet *p = *pkt;
386 struct ip6_hdr *ip6h = (struct ip6_hdr *)p->pkt_flow_ip_hdr;
387
388 p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
389 p->pkt_flow->flow_l4._l4_hlen;
390 }
391 return err;
392 }
393
/*
 * @internal copy the L3 portion of pkt into a freshly allocated mbuf so it
 * can be handed to icmp6_error_flag().  Both callers only invoke this for
 * IPv6 packets, hence the unconditional ip6_hdr treatment below.  The
 * caller owns the returned mbuf (the ICMP6 error path consumes it).
 */
static struct mbuf *
ipf_pkt2mbuf(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt)
{
    unsigned int one = 1;       /* request a single contiguous buffer */
    struct mbuf *m = NULL;
    uint8_t *buf;
    struct ip6_hdr *ip6;
    uint32_t l3t_len;
    int err;

    /* length of everything past the L2 header */
    l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
    if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
        /* don't copy the trailing Ethernet FCS */
        l3t_len -= ETHER_CRC_LEN;
    }

    err = mbuf_allocpacket(MBUF_WAITOK, l3t_len, &one, &m);
    VERIFY(err == 0);   /* MBUF_WAITOK: allocation cannot fail */
    ASSERT(l3t_len <= mbuf_maxlen(m));

    if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
        /* packet data lives in the attached mbuf */
        bcopy(pkt->pkt_mbuf->m_data + pkt->pkt_l2_len,
            m->m_data, l3t_len);
    } else {
        /* packet data lives in the buflet */
        MD_BUFLET_ADDR_ABS(pkt, buf);
        buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
        bcopy(buf, m->m_data, l3t_len);
    }
    m->m_pkthdr.len = m->m_len = l3t_len;

    /* restore embedded scope for link-local addresses */
    ip6 = mtod(m, struct ip6_hdr *);
    /* note for casting: IN6_IS_SCOPE_ doesn't need alignment */
    if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
        if (in6_embedded_scope) {
            ip6->ip6_src.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
        }
        ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
    }
    if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
        if (in6_embedded_scope) {
            ip6->ip6_dst.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
        }
        ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
    }

    return m;
}
440
441 /*
442 * Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
443 * we need to ensure we don't enter the driver directly because a deadlock
444 * can happen if this same thread tries to get the workloop lock.
445 */
446 static void
ipf_icmp6_error_flag(struct mbuf * m,int type,int code,int param,int flags)447 ipf_icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
448 {
449 sk_protect_t protect = sk_async_transmit_protect();
450 icmp6_error_flag(m, type, code, param, flags);
451 sk_async_transmit_unprotect(protect);
452 }
453
454 /*
455 * @internal IP fragment ICMP parameter problem error handling
456 *
457 * @param param
458 * offending parameter offset, only applicable to ICMPv6
459 */
460 void
ipf_icmp_param_err(struct fsw_ip_frag_mgr * mgr,struct __kern_packet * pkt,int param_offset)461 ipf_icmp_param_err(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt,
462 int param_offset)
463 {
464 if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
465 return;
466 }
467
468 struct mbuf *m = NULL;
469 m = ipf_pkt2mbuf(mgr, pkt);
470 if (__probable(m != NULL)) {
471 ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
472 param_offset, 0);
473 }
474
475 /* m would be free by icmp6_error_flag function */
476 }
477
478 /* @internal IP fragment ICMP timeout error handling */
479 void
ipf_icmp_timeout_err(struct fsw_ip_frag_mgr * mgr,struct ipf * f)480 ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
481 {
482 struct __kern_packet *pkt = f->ipf_pkt;
483 ASSERT(pkt != NULL);
484
485 /* no icmp error packet for ipv4 */
486 if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
487 return;
488 }
489
490 /* only for the first fragment */
491 if (f->ipf_off != 0) {
492 return;
493 }
494
495 struct mbuf *m = NULL;
496 m = ipf_pkt2mbuf(mgr, pkt);
497 if (__probable(m != NULL)) {
498 ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
499 ICMP6_TIME_EXCEED_REASSEMBLY, 0, 0);
500 }
501
502 /* m would be free by icmp6_error_flag function */
503 }
504
/*
 * @internal IP fragment processing, v4/v6 agnostic.
 *
 * On success with a complete datagram, *pkt_ptr becomes the head of the
 * reassembled fragment chain; on success with an incomplete datagram,
 * *pkt_ptr is set to NULL (the manager keeps the fragment).  On error,
 * *pkt_ptr is left untouched and the caller retains ownership of *pkt_ptr.
 *
 * NOTE(review): *tlen is only ever incremented here (once per trailing
 * fragment); it presumably arrives pre-initialized by the caller — confirm.
 */
int
ipf_process(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt_ptr,
    struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
    uint16_t fragpartlen, uint16_t fragflag, uint16_t *nfrags, uint16_t *tlen)
{
    struct __kern_packet *pkt = *pkt_ptr;
    struct __kern_packet *pkt_reassed = NULL;   /* set only when complete */
    struct ipfq *q, *mq = &mgr->ipfm_q;         /* mq is the list sentinel */
    struct ipf *f, *f_new, *f_down;
    uint32_t nfrags_freed;
    int next;
    int first_frag = 0;
    int err = 0;
    int local_ipfq_unfraglen;
    
    *nfrags = 0;
    
    SK_DF(SK_VERB_IP_FRAG, "id %5d fragoff %5d fragpartlen %5d "
        "fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);
    
    /*
     * Make sure that all fragments except last one have a data length
     * that's a non-zero multiple of 8 bytes.
     */
    if (fragflag && (fragpartlen == 0 || (fragpartlen & 0x7) != 0)) {
        SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
        STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
        /* point the ICMP6 parameter problem at the payload length */
        ipf_icmp_param_err(mgr, pkt,
            offsetof(struct ip6_hdr, ip6_plen));
        return ERANGE;
    }
    
    lck_mtx_lock(&mgr->ipfm_lock);
    
    /* find ipfq */
    for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
        if (ipf_key_cmp(key, &q->ipfq_key) == 0) {
            if (q->ipfq_is_dirty) {
                /* queue was abandoned after overlap; refuse */
                SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
                err = EINVAL;
                goto done;
            }
            break;
        }
    }
    
    /* not found, create new ipfq */
    if (q == mq) {
        first_frag = 1;
    
        q = ipfq_alloc(mgr, M_DONTWAIT);
        if (q == NULL) {
            STATS_INC(mgr->ipfm_stats,
                FSW_STATS_RX_FRAG_DROP_NOMEM);
            err = ENOMEM;
            goto done;
        }
    
        /* newest queue goes to the front of the list */
        ipfq_insque(q, mq);
        net_update_uptime();
    
        bcopy(key, &q->ipfq_key, sizeof(struct ipf_key));
        q->ipfq_down = q->ipfq_up = (struct ipf *)q;
        q->ipfq_unfraglen = -1; /* The 1st fragment has not arrived. */
        q->ipfq_nfrag = 0;
        q->ipfq_timestamp = _net_uptime;
    }
    
    ASSERT(!q->ipfq_is_dirty);
    
    /* this queue has reached per queue frag limit */
    if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
        nfrags_freed = ipfq_freefq(mgr, q, NULL);
        STATS_ADD(mgr->ipfm_stats,
            FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
        err = ENOMEM;
        goto done;
    }
    
    local_ipfq_unfraglen = q->ipfq_unfraglen;
    
    /*
     * If it's the 1st fragment, record the length of the
     * unfragmentable part and the next header of the fragment header.
     * Assume the first fragement to arrive will be correct.
     * We do not have any duplicate checks here yet so another packet
     * with fragoff == 0 could come and overwrite the ipfq_unfraglen
     * and worse, the next header, at any time.
     */
    if (fragoff == 0 && local_ipfq_unfraglen == -1) {
        local_ipfq_unfraglen = unfraglen;
    }
    
    /* Check that the reassembled packet would not exceed 65535 bytes. */
    if (local_ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
        SK_DF(SK_VERB_IP_FRAG, "frag too big");
        STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
        ipf_icmp_param_err(mgr, pkt, sizeof(struct ip6_hdr) +
            offsetof(struct ip6_frag, ip6f_offlg));
        err = ERANGE;
        goto done;
    }
    
    /*
     * If it's the 1st fragment, do the above check for each
     * fragment already stored in the reassembly queue.
     * If an error is found, still return 0, since we don't return
     * ownership of a chain of offending packets back to caller.
     */
    if (fragoff == 0) {
        for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
            f_down = f->ipf_down;
            if (local_ipfq_unfraglen + f->ipf_off + f->ipf_len >
                IP_MAXPACKET) {
                SK_DF(SK_VERB_IP_FRAG, "frag too big");
                STATS_INC(mgr->ipfm_stats,
                    FSW_STATS_RX_FRAG_BAD);
                /* drop the oversize fragment, keep the queue */
                ipf_deq(f);
                ipf_free_pkt(f);
                ipf_free(mgr, f);
            }
        }
    }
    
    f_new = ipf_alloc(mgr);
    if (f_new == NULL) {
        STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
        err = ENOMEM;
        goto done;
    }
    
    f_new->ipf_mff = fragflag;
    f_new->ipf_off = fragoff;
    f_new->ipf_len = fragpartlen;
    f_new->ipf_pkt = pkt;
    
    if (first_frag) {
        /* empty queue: insert directly after the sentinel */
        f = (struct ipf *)q;
        goto insert;
    }
    
    /* Find a segment which begins after this one does. */
    for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
        if (f->ipf_off > f_new->ipf_off) {
            break;
        }
    }
    
    /*
     * If any of the fragments being reassembled overlap with any
     * other fragments being reassembled for the same packet,
     * reassembly of that packet must be abandoned and all the
     * fragments that have been received for that packet must be
     * discarded, and no ICMP error messages should be sent.
     *
     * It should be noted that fragments may be duplicated in the
     * network. Instead of treating these exact duplicate fragments
     * as overlapping fragments, an implementation may choose to
     * detect this case and drop exact duplicate fragments while
     * keeping the other fragments belonging to the same packet.
     *
     * https://tools.ietf.org/html/rfc8200#appendix-B
     *
     * We apply this rule for both for IPv4 and IPv6 here.
     */
    if (((f->ipf_up != (struct ipf *)q) && /* prev frag spans into f_new */
        (f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > 0)) ||
        ((f != (struct ipf *)q) && /* f_new spans into next */
        (f_new->ipf_off + f_new->ipf_len - f->ipf_off > 0))) {
        STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
        /* Check for exact duplicate offset/length */
        if (((f->ipf_up != (struct ipf *)q) &&
            ((f->ipf_up->ipf_off != f_new->ipf_off) ||
            (f->ipf_up->ipf_len != f_new->ipf_len))) ||
            ((f != (struct ipf *)q) &&
            ((f->ipf_off != f_new->ipf_off) ||
            (f->ipf_len != f_new->ipf_len)))) {
            SK_DF(SK_VERB_IP_FRAG, "frag overlap");
            ipf_free(mgr, f_new);
            /* give up over-lapping fragments queue */
            SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
            ipfq_freef(mgr, q, NULL);
            /* mark dirty: later fragments of this datagram are refused */
            q->ipfq_is_dirty = true;
        } else {
            /* exact duplicate: drop just the new copy */
            ipf_free(mgr, f_new);
            SK_DF(SK_VERB_IP_FRAG, "frag dup");
        }
        err = ERANGE;
        goto done;
    }
    
insert:
    q->ipfq_unfraglen = local_ipfq_unfraglen;
    
    /*
     * Stick new segment in its place;
     * check for complete reassembly.
     * Move to front of packet queue, as we are
     * the most recently active fragmented packet.
     */
    ipf_enq(f_new, f->ipf_up);
    q->ipfq_nfrag++;
    next = 0;
    for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
        /* there is a hole */
        if (f->ipf_off != next) {
            goto done;
        }
        next += f->ipf_len;
    }
    /* we haven't got last frag yet */
    if (f->ipf_up->ipf_mff) {
        goto done;
    }
    
    /*
     * Reassembly is complete; concatenate fragments.
     */
    f = q->ipfq_down;
    f_down = f->ipf_down;
    pkt_reassed = f->ipf_pkt;
    *nfrags = 1;
    while (f_down != (struct ipf *)q) {
        /* chain __kern_packet with pkt_nextpkt ptr */
        f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
        (*nfrags)++;
        (*tlen) += f_down->ipf_len;
        /* redundant: f_down already equals f->ipf_down here */
        f_down = f->ipf_down;
        ipf_deq(f);
        ipf_free(mgr, f);
        f = f_down;
        f_down = f->ipf_down;
    }
    /* free the wrapper of the last fragment (packet lives on in chain) */
    ipf_free(mgr, f);
    
    err = 0;
    STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
    ipfq_remque(q);
    ipfq_free(mgr, q);
    
done:
    /* ipfm take ownership of, or return assembled packet, if no error */
    if (err == 0) {
        /* reass'ed packet if done; NULL otherwise */
        *pkt_ptr = pkt_reassed;
    }
    ipfq_sched_timeout(mgr, FALSE);
    lck_mtx_unlock(&mgr->ipfm_lock);
    return err;
}
756
757 static int
ipf_key_cmp(struct ipf_key * a,struct ipf_key * b)758 ipf_key_cmp(struct ipf_key *a, struct ipf_key *b)
759 {
760 int d;
761
762 if ((d = (a->ipfk_len - b->ipfk_len)) != 0) {
763 return d;
764 }
765
766 if ((d = (a->ipfk_ident - b->ipfk_ident)) != 0) {
767 return d;
768 }
769
770 return memcmp(a->ipfk_addr, b->ipfk_addr, a->ipfk_len);
771 }
772
773 /*
774 * Put an ip fragment on a reassembly chain.
775 * Like insque, but pointers in middle of structure.
776 */
777 static void
ipf_enq(struct ipf * f,struct ipf * up6)778 ipf_enq(struct ipf *f, struct ipf *up6)
779 {
780 f->ipf_up = up6;
781 f->ipf_down = up6->ipf_down;
782 up6->ipf_down->ipf_up = f;
783 up6->ipf_down = f;
784 }
785
786 /*
787 * To ipf_enq as remque is to insque.
788 */
789 static void
ipf_deq(struct ipf * f)790 ipf_deq(struct ipf *f)
791 {
792 f->ipf_up->ipf_down = f->ipf_down;
793 f->ipf_down->ipf_up = f->ipf_up;
794 }
795
796 static void
ipfq_insque(struct ipfq * new,struct ipfq * old)797 ipfq_insque(struct ipfq *new, struct ipfq *old)
798 {
799 new->ipfq_prev = old;
800 new->ipfq_next = old->ipfq_next;
801 old->ipfq_next->ipfq_prev = new;
802 old->ipfq_next = new;
803 }
804
805 static void
ipfq_remque(struct ipfq * p6)806 ipfq_remque(struct ipfq *p6)
807 {
808 p6->ipfq_prev->ipfq_next = p6->ipfq_next;
809 p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
810 }
811
812 /*
813 * @internal drain reassembly queue till reaching target q count.
814 */
815 static void
_ipfq_reap(struct fsw_ip_frag_mgr * mgr,uint32_t target_q_count,void (* ipf_cb)(struct fsw_ip_frag_mgr *,struct ipf *))816 _ipfq_reap(struct fsw_ip_frag_mgr *mgr, uint32_t target_q_count,
817 void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
818 {
819 uint32_t n_freed = 0;
820
821 LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
822
823 SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
824 mgr->ipfm_f_count, mgr->ipfm_f_limit, mgr->ipfm_q_count,
825 mgr->ipfm_q_limit);
826
827 while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q &&
828 mgr->ipfm_q_count > target_q_count) {
829 n_freed += ipfq_freefq(mgr, mgr->ipfm_q.ipfq_prev,
830 mgr->ipfm_q.ipfq_prev->ipfq_is_dirty ? NULL : ipf_cb);
831 }
832
833 STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_REAPED, n_freed);
834 }
835
836 /*
837 * @internal reap half reassembly queues to allow newer fragment assembly.
838 */
839 static void
ipfq_reap(struct fsw_ip_frag_mgr * mgr)840 ipfq_reap(struct fsw_ip_frag_mgr *mgr)
841 {
842 _ipfq_reap(mgr, mgr->ipfm_q_count / 2, ipf_icmp_timeout_err);
843 }
844
845 /*
846 * @internal reap all reassembly queues, for shutdown etc.
847 */
848 static void
ipfq_drain(struct fsw_ip_frag_mgr * mgr)849 ipfq_drain(struct fsw_ip_frag_mgr *mgr)
850 {
851 _ipfq_reap(mgr, 0, NULL);
852 }
853
854 static void
ipfq_timeout(thread_call_param_t arg0,thread_call_param_t arg1)855 ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
856 {
857 #pragma unused(arg1)
858 struct fsw_ip_frag_mgr *mgr = arg0;
859 struct ipfq *q;
860 uint64_t now, elapsed;
861 uint32_t n_freed = 0;
862
863 net_update_uptime();
864 now = _net_uptime;
865
866 SK_DF(SK_VERB_IP_FRAG, "run");
867 lck_mtx_lock(&mgr->ipfm_lock);
868 q = mgr->ipfm_q.ipfq_next;
869 if (q) {
870 while (q != &mgr->ipfm_q) {
871 q = q->ipfq_next;
872 elapsed = now - q->ipfq_prev->ipfq_timestamp;
873 if (elapsed > ipfm_frag_ttl) {
874 SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
875 q->ipfq_prev->ipfq_key.ipfk_ident);
876 n_freed = ipfq_freefq(mgr, q->ipfq_prev,
877 q->ipfq_is_dirty ? NULL :
878 ipf_icmp_timeout_err);
879 }
880 }
881 }
882 STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, n_freed);
883
884 /* If running out of resources, drain ipfm queues (oldest one first) */
885 if (mgr->ipfm_f_count >= mgr->ipfm_f_limit ||
886 mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
887 ipfq_reap(mgr);
888 }
889
890 /* re-arm the purge timer if there's work to do */
891 if (mgr->ipfm_q_count > 0) {
892 ipfq_sched_timeout(mgr, TRUE);
893 }
894 lck_mtx_unlock(&mgr->ipfm_lock);
895 }
896
897 static void
ipfq_sched_timeout(struct fsw_ip_frag_mgr * mgr,boolean_t in_tcall)898 ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
899 {
900 uint32_t delay = MAX(1, ipfm_timeout_tcall_ival); /* seconds */
901 thread_call_t tcall = mgr->ipfm_timeout_tcall;
902 uint64_t now = mach_absolute_time();
903 uint64_t ival, deadline = now;
904
905 LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
906
907 ASSERT(tcall != NULL);
908 if (mgr->ipfm_q_count > 0 &&
909 (!thread_call_isactive(tcall) || in_tcall)) {
910 nanoseconds_to_absolutetime(delay * NSEC_PER_SEC, &ival);
911 clock_deadline_for_periodic_event(ival, now, &deadline);
912 (void) thread_call_enter_delayed(tcall, deadline);
913 }
914 }
915
/* @internal sysctl handler ("drain" skoid): drop all pending reassembly state */
static int
ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
    struct fsw_ip_frag_mgr *mgr = arg1;   /* registered in mgr_create */

    SKOID_PROC_CALL_GUARD;

    lck_mtx_lock(&mgr->ipfm_lock);
    ipfq_drain(mgr);
    lck_mtx_unlock(&mgr->ipfm_lock);

    return 0;
}
930
931 static struct ipfq *
ipfq_alloc(struct fsw_ip_frag_mgr * mgr,int how)932 ipfq_alloc(struct fsw_ip_frag_mgr *mgr, int how)
933 {
934 struct mbuf *t;
935 struct ipfq *q;
936
937 if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
938 ipfq_reap(mgr);
939 }
940 ASSERT(mgr->ipfm_q_count <= mgr->ipfm_q_limit);
941
942 t = m_get(how, MT_FTABLE);
943 if (t != NULL) {
944 mgr->ipfm_q_count++;
945 q = mtod(t, struct ipfq *);
946 bzero(q, sizeof(*q));
947 q->ipfq_is_dirty = false;
948 } else {
949 q = NULL;
950 }
951 return q;
952 }
953
954 /* free q */
955 static void
ipfq_free(struct fsw_ip_frag_mgr * mgr,struct ipfq * q)956 ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q)
957 {
958 (void) m_free(dtom(q));
959 mgr->ipfm_q_count--;
960 }
961
962 /*
963 * Free all fragments, keep q.
964 * @return: number of frags freed
965 */
966 static uint32_t
ipfq_freef(struct fsw_ip_frag_mgr * mgr,struct ipfq * q,void (* ipf_cb)(struct fsw_ip_frag_mgr *,struct ipf *))967 ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
968 void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
969 {
970 struct ipf *f, *down6;
971 uint32_t nfrags = 0;
972
973 for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
974 nfrags++;
975 down6 = f->ipf_down;
976 ipf_deq(f);
977 if (ipf_cb != NULL) {
978 (*ipf_cb)(mgr, f);
979 }
980 ipf_free_pkt(f);
981 ipf_free(mgr, f);
982 }
983
984 return nfrags;
985 }
986
987 /* Free both all fragments and q
988 * @return: number of frags freed
989 */
990 static uint32_t
ipfq_freefq(struct fsw_ip_frag_mgr * mgr,struct ipfq * q,void (* ipf_cb)(struct fsw_ip_frag_mgr *,struct ipf *))991 ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
992 void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
993 {
994 uint32_t freed_count;
995 freed_count = ipfq_freef(mgr, q, ipf_cb);
996 ipfq_remque(q);
997 ipfq_free(mgr, q);
998 return freed_count;
999 }
1000
1001 static struct ipf *
ipf_alloc(struct fsw_ip_frag_mgr * mgr)1002 ipf_alloc(struct fsw_ip_frag_mgr *mgr)
1003 {
1004 struct mbuf *t;
1005 struct ipf *f;
1006
1007 if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
1008 STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
1009 return NULL;
1010 }
1011
1012 t = m_get(M_DONTWAIT, MT_FTABLE);
1013 if (t != NULL) {
1014 mgr->ipfm_f_count++;
1015 f = mtod(t, struct ipf *);
1016 bzero(f, sizeof(*f));
1017 } else {
1018 f = NULL;
1019 }
1020 return f;
1021 }
1022
1023 static void
ipf_free_pkt(struct ipf * f)1024 ipf_free_pkt(struct ipf *f)
1025 {
1026 struct __kern_packet *pkt = f->ipf_pkt;
1027 ASSERT(pkt != NULL);
1028 pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
1029 SK_PTR_ADDR(pkt));
1030 }
1031
1032 static void
ipf_free(struct fsw_ip_frag_mgr * mgr,struct ipf * f)1033 ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
1034 {
1035 (void) m_free(dtom(f));
1036 mgr->ipfm_f_count--;
1037 }
1038