xref: /xnu-8020.121.3/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /* $FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $ */
30 /* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */
31 
32 /*
33  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. Neither the name of the project nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  */
60 
61 /*
62  * @file
63  * flowswitch IP Reassembly for both v4 and v6
64  *
65  * Implementation of IP packet fragmentation and reassembly.
66  *
67  */
68 
69 #include <sys/domain.h>
70 #include <netinet/in.h>
71 #include <netinet/ip6.h>
72 #include <netinet/icmp6.h>
73 #include <skywalk/os_skywalk_private.h>
74 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
75 #include <skywalk/nexus/flowswitch/fsw_var.h>
76 
77 #define IPFM_MAX_FRAGS_PER_QUEUE        128     /* RFC 791 64K/(512 min MTU) */
78 #define IPFM_MAX_QUEUES                 1024    /* same as ip/ip6 */
79 #define IPFM_FRAG_TTL                   60      /* RFC 2460 */
80 #define IPFM_TIMEOUT_TCALL_INTERVAL     1
81 
82 static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
83 static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
84 static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;
85 
86 #if (DEVELOPMENT || DEBUG)
87 SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
88     ipfm_max_frags_per_queue, CTLFLAG_RW | CTLFLAG_LOCKED,
89     &ipfm_max_frags_per_queue, 0, "");
#endif /* DEVELOPMENT || DEBUG */
91 
92 SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
93     CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_frag_ttl, 0, "");
94 SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
95     ipfm_timeout_tcall_ival, CTLFLAG_RW | CTLFLAG_LOCKED,
96     &ipfm_timeout_tcall_ival, 0, "");
97 
98 static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
99 static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, 0, 0);
100 
/*
 * @internal ip fragment wrapper (chained in an ipfq) for __kern_packet.
 * Fragments in a queue are kept sorted by ipf_off (see ipf_process()).
 */
struct ipf {
	struct ipf      *ipf_down;      /* next fragment (higher offset) */
	struct ipf      *ipf_up;        /* previous fragment (lower offset) */
	struct __kern_packet *ipf_pkt;  /* wrapped fragment packet */
	int             ipf_len;        /* fragmentable part length */
	int             ipf_off;        /* fragment offset */
	uint16_t        ipf_mff;        /* more fragment bit in frag off */
};
110 
/*
 * @internal ip fragment lookup key.
 * Only the first ipfk_len bytes of ipfk_addr are significant; the rest is
 * ignored when comparing keys (see ipf_key_cmp()).
 */
struct ipf_key {
	uint64_t        ipfk_addr[4];   /* src + dst ip addr (v4/v6) */
	uint32_t        ipfk_ident;     /* IP identification */
	uint16_t        ipfk_len;       /* len of ipfk_addr field */
};
117 
/* byte lengths of the src+dst address pair stored in ipfk_addr */
enum {
	IPFK_LEN_V4 = 2 * sizeof(struct in_addr),       /* 8 bytes */
	IPFK_LEN_V6 = 2 * sizeof(struct in6_addr),      /* 32 bytes */
};
122 
/*
 * @internal
 * IP reassembly queue structure.  Each fragment (struct ipf)
 * being reassembled is attached to one of these structures.
 * Queues are linked on the manager's circular list headed by ipfm_q.
 */
struct ipfq {
	struct ipf      *ipfq_down;     /* fragment chain (offset order) */
	struct ipf      *ipfq_up;
	struct ipfq     *ipfq_next;     /* queue chain */
	struct ipfq     *ipfq_prev;
	uint64_t        ipfq_timestamp; /* time of creation (net uptime) */
	struct ipf_key  ipfq_key;       /* ipfq search key */
	uint16_t        ipfq_nfrag;     /* # of fragments in queue */
	uint16_t        ipfq_unfraglen; /* len of unfragmentable part */
	bool            ipfq_is_dirty;  /* overlap detected, don't use */
};
139 
/*
 * @internal (externally opaque)
 * flowswitch IP Fragment Manager.
 * ipfm_q is the sentinel of a circular doubly-linked list of reassembly
 * queues (initialized self-pointing in fsw_ip_frag_mgr_create()).
 */
struct fsw_ip_frag_mgr {
	struct skoid    ipfm_skoid;     /* sysctl subtree for this mgr */
	struct ipfq     ipfm_q;         /* ip reassembly queues (sentinel) */
	uint32_t        ipfm_q_limit;   /* limit # of reass queues */
	uint32_t        ipfm_q_count;   /* # of allocated reass queues */
	uint32_t        ipfm_f_limit;   /* limit # of ipfs */
	uint32_t        ipfm_f_count;   /* current # of allocated ipfs */
	decl_lck_mtx_data(, ipfm_lock); /* guard reass and timeout cleanup */
	thread_call_t   ipfm_timeout_tcall;     /* frag timeout thread */

	struct ifnet    *ipfm_ifp;      /* owning interface */
	struct fsw_stats *ipfm_stats;   /* indirect stats in fsw */
};
157 
158 static int ipf_process(struct fsw_ip_frag_mgr *, struct __kern_packet **,
159     struct ipf_key *, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t *,
160     uint16_t *);
161 static int ipf_key_cmp(struct ipf_key *, struct ipf_key *);
162 static void ipf_enq(struct ipf *, struct ipf *);
163 static void ipf_deq(struct ipf *);
164 static void ipfq_insque(struct ipfq *, struct ipfq *);
165 static void ipfq_remque(struct ipfq *);
166 static uint32_t ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *,
167     void (*)(struct fsw_ip_frag_mgr *, struct ipf *));
168 
169 static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
170 static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);
171 
172 static struct ipfq *ipfq_alloc(struct fsw_ip_frag_mgr *mgr, int how);
173 static void ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q);
174 static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
175     void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *));
176 static struct ipf *ipf_alloc(struct fsw_ip_frag_mgr *mgr);
177 static void ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f);
178 static void ipf_free_pkt(struct ipf *f);
179 static void ipfq_drain(struct fsw_ip_frag_mgr *);
180 static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;
181 void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt,
182     int param);
183 void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f);
184 
185 /* Create a flowswitch IP fragment manager. */
186 struct fsw_ip_frag_mgr *
fsw_ip_frag_mgr_create(struct nx_flowswitch * fsw,struct ifnet * ifp,size_t f_limit)187 fsw_ip_frag_mgr_create(struct nx_flowswitch *fsw, struct ifnet *ifp,
188     size_t f_limit)
189 {
190 	struct fsw_ip_frag_mgr *mgr;
191 
192 	/* ipf/ipfq uses mbufs for IP fragment queue structures */
193 	_CASSERT(sizeof(struct ipfq) <= _MLEN);
194 	_CASSERT(sizeof(struct ipf) <= _MLEN);
195 
196 	ASSERT(ifp != NULL);
197 
198 	mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK | Z_NOFAIL,
199 	    skmem_tag_fsw_frag_mgr);
200 
201 	mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
202 	lck_mtx_init(&mgr->ipfm_lock, &fsw_ipfm_lock_group, &fsw_ipfm_lock_attr);
203 
204 	mgr->ipfm_timeout_tcall =
205 	    thread_call_allocate_with_options(ipfq_timeout, mgr,
206 	    THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
207 	VERIFY(mgr->ipfm_timeout_tcall != NULL);
208 
209 	mgr->ipfm_ifp = ifp;
210 	mgr->ipfm_stats = &fsw->fsw_stats;
211 
212 	/* Use caller provided limit (caller knows pool size) */
213 	ASSERT(f_limit >= 2 && f_limit < UINT32_MAX);
214 	mgr->ipfm_f_limit = (uint32_t)f_limit;
215 	mgr->ipfm_f_count = 0;
216 	mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / 2);
217 	mgr->ipfm_q_count = 0;
218 
219 	skoid_create(&mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid), "ipfm", 0);
220 	skoid_add_uint(&mgr->ipfm_skoid, "frag_limit", CTLFLAG_RW,
221 	    &mgr->ipfm_f_limit);
222 	skoid_add_uint(&mgr->ipfm_skoid, "frag_count", CTLFLAG_RD,
223 	    &mgr->ipfm_f_count);
224 	skoid_add_uint(&mgr->ipfm_skoid, "queue_limit", CTLFLAG_RW,
225 	    &mgr->ipfm_q_limit);
226 	skoid_add_uint(&mgr->ipfm_skoid, "queue_count", CTLFLAG_RD,
227 	    &mgr->ipfm_q_count);
228 	skoid_add_handler(&mgr->ipfm_skoid, "drain", CTLFLAG_RW,
229 	    ipfq_drain_sysctl, mgr, 0);
230 
231 	return mgr;
232 }
233 
/*
 * Free a flowswitch IP fragment manager.
 * Cancels the timeout callout, drops any pending reassembly state, and
 * releases the manager itself.  Caller must guarantee no further use.
 */
void
fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
{
	thread_call_t tcall;

	lck_mtx_lock(&mgr->ipfm_lock);
	if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
		/*
		 * Drop the lock while cancelling: ipfq_timeout() takes
		 * ipfm_lock itself, so waiting for it to finish with the
		 * lock held would deadlock.
		 */
		lck_mtx_unlock(&mgr->ipfm_lock);
		(void) thread_call_cancel_wait(tcall);
		(void) thread_call_free(tcall);
		/*
		 * NOTE(review): cleared without holding ipfm_lock; assumed
		 * benign because no new timeouts can be scheduled during
		 * teardown — confirm against callers.
		 */
		mgr->ipfm_timeout_tcall = NULL;
		lck_mtx_lock(&mgr->ipfm_lock);
	}

	/* free all queues and fragments still pending reassembly */
	ipfq_drain(mgr);

	lck_mtx_unlock(&mgr->ipfm_lock);
	lck_mtx_destroy(&mgr->ipfm_lock, &fsw_ipfm_lock_group);

	skoid_destroy(&mgr->ipfm_skoid);
	sk_free_type(struct fsw_ip_frag_mgr, mgr);
}
257 
258 /*
259  * Reassemble a received IPv4 fragment.
260  *
261  * @param mgr
262  *   fragment manager
263  * @param pkt
264  *   received packet (must have ipv4 header validated)
265  * @param ip4
266  *   pointer to the packet's IPv4 header
267  * @param nfrags
268  *   number of fragments reassembled
269  * @return
270  *   Successfully processed (not fully reassembled)
271  *     ret = 0, *pkt = NULL(ipfm owns it), *nfrags=0
272  *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment(fragments chained in order by pkt_nextpkt)
274  *     *nfrags = number of all fragments (>0)
275  *   Error
276  *     ret != 0 && *pkt unmodified (caller to decide what to do with *pkt)
277  *     *nfrags = 0
278  */
279 int
fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr * mgr,struct __kern_packet ** pkt,struct ip * ip4,uint16_t * nfrags,uint16_t * tlen)280 fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
281     struct ip *ip4, uint16_t *nfrags, uint16_t *tlen)
282 {
283 	struct ipf_key key;
284 	uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
285 	int err;
286 
287 	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);
288 
289 	bcopy((void *)&ip4->ip_src, (void *)key.ipfk_addr, IPFK_LEN_V4);
290 	key.ipfk_len = IPFK_LEN_V4;
291 	key.ipfk_ident = (uint32_t)ip4->ip_id;
292 
293 	unfragpartlen = (uint16_t)(ip4->ip_hl << 2);
294 	offflag = ntohs(ip4->ip_off);
295 	fragoff = (uint16_t)(offflag << 3);
296 	fragpartlen = ntohs(ip4->ip_len) - (uint16_t)(ip4->ip_hl << 2);
297 	fragflag = offflag & IP_MF;
298 
299 	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
300 	    fragflag, nfrags, tlen);
301 
302 	/*
303 	 * If packet has been reassembled compute the user data length.
304 	 */
305 	if (*pkt != NULL) {
306 		struct __kern_packet *p = *pkt;
307 		struct ip *iph = (struct ip *)p->pkt_flow_ip_hdr;
308 
309 		p->pkt_flow_ulen = ntohs(iph->ip_len) -
310 		    p->pkt_flow_ip_hlen - p->pkt_flow->flow_l4._l4_hlen;
311 	}
312 	return err;
313 }
314 
315 /*
316  * Reassemble a received IPv6 fragment.
317  *
318  * @param mgr
319  *   fragment manager
320  * @param pkt
321  *   received packet (must have ipv6 header validated)
322  * @param ip6
323  *   pointer to the packet's IPv6 header
324  * @param ip6f
325  *   pointer to the packet's IPv6 Fragment Header
326  * @param nfrags
327  *   number of fragments reassembled
328  * @return
329  *   Successfully processed (not fully reassembled)
330  *     ret = 0, *pkt = NULL(ipfm owns it), *nfrags=0
331  *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment(fragments chained in order by pkt_nextpkt)
333  *     *nfrags = number of all fragments (>0)
334  *   Error
335  *     ret != 0 && *pkt unmodified (caller to decide what to do with *pkt)
336  *     *nfrags = 0
337  */
int
fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip6_hdr *ip6, struct ip6_frag *ip6f, uint16_t *nfrags,
    uint16_t *tlen)
{
	struct ipf_key key;
	/* byte offset of the fragment header from the start of the v6 hdr */
	ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
	uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
	int err;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);

	/* jumbo payload can't contain a fragment header */
	if (ip6->ip6_plen == 0) {
		*nfrags = 0;
		return ERANGE;
	}

	ASSERT(ip6f_ptroff < UINT16_MAX);
	ip6f_off = (uint16_t)ip6f_ptroff;
	/* offset field occupies the upper 13 bits, already scaled to bytes */
	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
	/*
	 * Fragmentable part: v6 payload minus any extension headers before
	 * the fragment header and the fragment header itself.
	 */
	fragpartlen = ntohs(ip6->ip6_plen) -
	    (ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
	unfragpartlen = ip6f_off;
	/* nonzero iff M bit set (both operands in network byte order) */
	fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;

	/*
	 * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0)
	 * upfront, unrelated to any reassembly.
	 *
	 * Flow classifier should process those as non-frag, ipfm shouldn't see
	 * them.
	 */
	ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != 0);

	/* v6 key: src+dst address pair + fragment header identification */
	bcopy((void *)&ip6->ip6_src, (void *)key.ipfk_addr, IPFK_LEN_V6);
	key.ipfk_len = IPFK_LEN_V6;
	key.ipfk_ident = ip6f->ip6f_ident;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
	    fragflag, nfrags, tlen);

	/*
	 * If packet has been reassembled compute the user data length
	 * (payload after the L4 header).
	 */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip6_hdr *ip6h = (struct ip6_hdr *)p->pkt_flow_ip_hdr;

		p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
		    p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}
392 
/*
 * @internal copy a packet's L3 payload into a freshly allocated mbuf,
 * suitable for handing to icmp6_error_flag().
 *
 * NOTE(review): the copied data is interpreted as an IPv6 header for the
 * scope-id embedding below; both current callers invoke this only for IPv6
 * packets — confirm before adding IPv4 callers.
 */
static struct mbuf *
ipf_pkt2mbuf(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt)
{
	unsigned int one = 1;   /* request a single contiguous mbuf */
	struct mbuf *m = NULL;
	uint8_t *buf;
	struct ip6_hdr *ip6;
	uint32_t l3t_len;
	int err;

	/* L3 total length: everything after the link-layer header */
	l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
	if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
		l3t_len -= ETHER_CRC_LEN;       /* exclude trailing FCS */
	}

	err = mbuf_allocpacket(MBUF_WAITOK, l3t_len, &one, &m);
	VERIFY(err == 0);       /* MBUF_WAITOK: cannot fail */
	ASSERT(l3t_len <= mbuf_maxlen(m));

	/* copy from the attached mbuf if present, else from the buflet */
	if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
		bcopy(pkt->pkt_mbuf->m_data + pkt->pkt_l2_len,
		    m->m_data, l3t_len);
	} else {
		MD_BUFLET_ADDR_ABS(pkt, buf);
		buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
		bcopy(buf, m->m_data, l3t_len);
	}
	m->m_pkthdr.len = m->m_len = l3t_len;

	ip6 = mtod(m, struct ip6_hdr *);
	/* note for casting: IN6_IS_SCOPE_ doesn't need alignment */
	if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
		if (in6_embedded_scope) {
			/* embed the interface scope in the address itself */
			ip6->ip6_src.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}
	if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
		if (in6_embedded_scope) {
			ip6->ip6_dst.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}

	return m;
}
439 
440 /*
441  * Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
442  * we need to ensure we don't enter the driver directly because a deadlock
443  * can happen if this same thread tries to get the workloop lock.
444  */
445 static void
ipf_icmp6_error_flag(struct mbuf * m,int type,int code,int param,int flags)446 ipf_icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
447 {
448 	sk_protect_t protect = sk_async_transmit_protect();
449 	icmp6_error_flag(m, type, code, param, flags);
450 	sk_async_transmit_unprotect(protect);
451 }
452 
453 /*
454  * @internal IP fragment ICMP parameter problem error handling
455  *
456  * @param param
457  *   offending parameter offset, only applicable to ICMPv6
458  */
459 void
ipf_icmp_param_err(struct fsw_ip_frag_mgr * mgr,struct __kern_packet * pkt,int param_offset)460 ipf_icmp_param_err(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt,
461     int param_offset)
462 {
463 	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
464 		return;
465 	}
466 
467 	struct mbuf *m = NULL;
468 	m = ipf_pkt2mbuf(mgr, pkt);
469 	if (__probable(m != NULL)) {
470 		ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
471 		    param_offset, 0);
472 	}
473 
474 	/* m would be free by icmp6_error_flag function */
475 }
476 
477 /* @internal IP fragment ICMP timeout error handling */
478 void
ipf_icmp_timeout_err(struct fsw_ip_frag_mgr * mgr,struct ipf * f)479 ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
480 {
481 	struct __kern_packet *pkt = f->ipf_pkt;
482 	ASSERT(pkt != NULL);
483 
484 	/* no icmp error packet for ipv4 */
485 	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
486 		return;
487 	}
488 
489 	/* only for the first fragment */
490 	if (f->ipf_off != 0) {
491 		return;
492 	}
493 
494 	struct mbuf *m = NULL;
495 	m = ipf_pkt2mbuf(mgr, pkt);
496 	if (__probable(m != NULL)) {
497 		ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
498 		    ICMP6_TIME_EXCEED_REASSEMBLY, 0, 0);
499 	}
500 
501 	/* m would be free by icmp6_error_flag function */
502 }
503 
/*
 * @internal IP fragment processing, v4/v6 agnostic.
 *
 * Takes one fragment described by (key, unfraglen, fragoff, fragpartlen,
 * fragflag), files it in (or creates) the matching reassembly queue, and
 * on completion hands back the full chain of fragments via *pkt_ptr.
 * See fsw_ip_frag_reass_v4()/_v6() headers for the return contract.
 */
int
ipf_process(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt_ptr,
    struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
    uint16_t fragpartlen, uint16_t fragflag, uint16_t *nfrags, uint16_t *tlen)
{
	struct __kern_packet *pkt = *pkt_ptr;
	struct __kern_packet *pkt_reassed = NULL;
	struct ipfq *q, *mq = &mgr->ipfm_q;     /* mq is the list sentinel */
	struct ipf *f, *f_new, *f_down;
	uint32_t nfrags_freed;
	int next;
	int first_frag = 0;
	int err = 0;

	*nfrags = 0;

	SK_DF(SK_VERB_IP_FRAG, "id %5d  fragoff %5d  fragpartlen %5d  "
	    "fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);

	/*
	 * Make sure that all fragments except last one have a data length
	 * that's a non-zero multiple of 8 bytes.
	 */
	if (fragflag && (fragpartlen == 0 || (fragpartlen & 0x7) != 0)) {
		SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
		ipf_icmp_param_err(mgr, pkt,
		    offsetof(struct ip6_hdr, ip6_plen));
		return ERANGE;
	}

	lck_mtx_lock(&mgr->ipfm_lock);

	/* find ipfq: linear scan of the circular queue list */
	for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
		if (ipf_key_cmp(key, &q->ipfq_key) == 0) {
			/* overlap was seen earlier; refuse new fragments */
			if (q->ipfq_is_dirty) {
				SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
				err = EINVAL;
				goto done;
			}
			break;
		}
	}

	/* not found, create new ipfq */
	if (q == mq) {
		first_frag = 1;

		q = ipfq_alloc(mgr, M_DONTWAIT);
		if (q == NULL) {
			STATS_INC(mgr->ipfm_stats,
			    FSW_STATS_RX_FRAG_DROP_NOMEM);
			err = ENOMEM;
			goto done;
		}

		ipfq_insque(q, mq);
		net_update_uptime();

		bcopy(key, &q->ipfq_key, sizeof(struct ipf_key));
		/* empty fragment chain: sentinel points at itself */
		q->ipfq_down = q->ipfq_up = (struct ipf *)q;
		q->ipfq_unfraglen = 0;
		q->ipfq_nfrag = 0;
		q->ipfq_timestamp = _net_uptime;        /* for TTL expiry */
	}

	ASSERT(!q->ipfq_is_dirty);

	/* this queue has reached per queue frag limit */
	if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
		nfrags_freed = ipfq_freefq(mgr, q, NULL);
		STATS_ADD(mgr->ipfm_stats,
		    FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
		err = ENOMEM;
		goto done;
	}

	/*
	 * If it's the 1st fragment, record the length of the
	 * unfragmentable part and the next header of the fragment header.
	 */
	if (fragoff == 0) {
		q->ipfq_unfraglen = unfraglen;
	}

	/* Check that the reassembled packet would not exceed 65535 bytes. */
	if (q->ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
		SK_DF(SK_VERB_IP_FRAG, "frag too big");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		ipf_icmp_param_err(mgr, pkt, sizeof(struct ip6_hdr) +
		    offsetof(struct ip6_frag, ip6f_offlg));
		err = ERANGE;
		goto done;
	}

	/*
	 * If it's the 1st fragment, do the above check for each
	 * fragment already stored in the reassembly queue.
	 * If an error is found, still return 0, since we don't return
	 * ownership of a chain of offending packets back to caller.
	 */
	if (fragoff == 0) {
		/* save the successor before unlinking the current frag */
		for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
			f_down = f->ipf_down;
			if (q->ipfq_unfraglen + f->ipf_off + f->ipf_len >
			    IP_MAXPACKET) {
				SK_DF(SK_VERB_IP_FRAG, "frag too big");
				STATS_INC(mgr->ipfm_stats,
				    FSW_STATS_RX_FRAG_BAD);
				ipf_deq(f);
				ipf_free_pkt(f);
				ipf_free(mgr, f);
			}
		}
	}

	f_new = ipf_alloc(mgr);
	if (f_new == NULL) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
		err = ENOMEM;
		goto done;
	}

	/* wrap the incoming packet; ipfm owns it from here on success */
	f_new->ipf_mff = fragflag;
	f_new->ipf_off = fragoff;
	f_new->ipf_len = fragpartlen;
	f_new->ipf_pkt = pkt;

	/* brand-new queue: insert directly after the sentinel */
	if (first_frag) {
		f = (struct ipf *)q;
		goto insert;
	}

	/* Find a segment which begins after this one does. */
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		if (f->ipf_off > f_new->ipf_off) {
			break;
		}
	}

	/*
	 * If any of the fragments being reassembled overlap with any
	 * other fragments being reassembled for the same packet,
	 * reassembly of that packet must be abandoned and all the
	 * fragments that have been received for that packet must be
	 * discarded, and no ICMP error messages should be sent.
	 *
	 * It should be noted that fragments may be duplicated in the
	 * network.  Instead of treating these exact duplicate fragments
	 * as overlapping fragments, an implementation may choose to
	 * detect this case and drop exact duplicate fragments while
	 * keeping the other fragments belonging to the same packet.
	 *
	 * https://tools.ietf.org/html/rfc8200#appendix-B
	 *
	 * We apply this rule for both for IPv4 and IPv6 here.
	 */
	if (((f->ipf_up != (struct ipf *)q) &&  /* prev frag spans into f_new */
	    (f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > 0)) ||
	    ((f != (struct ipf *)q) &&  /* f_new spans into next */
	    (f_new->ipf_off + f_new->ipf_len - f->ipf_off > 0))) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		/*
		 * Check for exact duplicate offset/length: an exact dup is
		 * silently dropped, any other overlap poisons the queue.
		 */
		if (((f->ipf_up != (struct ipf *)q) &&
		    ((f->ipf_up->ipf_off != f_new->ipf_off) ||
		    (f->ipf_up->ipf_len != f_new->ipf_len))) ||
		    ((f != (struct ipf *)q) &&
		    ((f->ipf_off != f_new->ipf_off) ||
		    (f->ipf_len != f_new->ipf_len)))) {
			SK_DF(SK_VERB_IP_FRAG, "frag overlap");
			ipf_free(mgr, f_new);
			/* give up over-lapping fragments queue */
			SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
			ipfq_freef(mgr, q, NULL);
			/* keep the empty queue around to reject late frags */
			q->ipfq_is_dirty = true;
		} else {
			ipf_free(mgr, f_new);
			SK_DF(SK_VERB_IP_FRAG, "frag dup");
		}
		err = ERANGE;
		goto done;
	}

insert:
	/*
	 * Stick new segment in its place;
	 * check for complete reassembly.
	 * Move to front of packet queue, as we are
	 * the most recently active fragmented packet.
	 */
	ipf_enq(f_new, f->ipf_up);
	q->ipfq_nfrag++;
	next = 0;
	/* walk in offset order looking for a gap */
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		/* there is a hole */
		if (f->ipf_off != next) {
			goto done;
		}
		next += f->ipf_len;
	}
	/* we haven't got last frag yet (last fragment still has MF set) */
	if (f->ipf_up->ipf_mff) {
		goto done;
	}

	/*
	 * Reassembly is complete; concatenate fragments.
	 */
	f = q->ipfq_down;
	f_down = f->ipf_down;
	pkt_reassed = f->ipf_pkt;       /* head of the reassembled chain */
	*nfrags = 1;
	while (f_down != (struct ipf *)q) {
		/* chain __kern_packet with pkt_nextpkt ptr */
		f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
		(*nfrags)++;
		(*tlen) += f_down->ipf_len;
		f_down = f->ipf_down;
		ipf_deq(f);
		ipf_free(mgr, f);       /* wrapper only; pkt lives on */
		f = f_down;
		f_down = f->ipf_down;
	}
	ipf_free(mgr, f);

	err = 0;
	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
	ipfq_remque(q);
	ipfq_free(mgr, q);

done:
	/* ipfm take ownership of, or return assembled packet, if no error */
	if (err == 0) {
		/* reass'ed packet if done; NULL otherwise */
		*pkt_ptr = pkt_reassed;
	}
	ipfq_sched_timeout(mgr, FALSE);
	lck_mtx_unlock(&mgr->ipfm_lock);
	return err;
}
746 
747 static int
ipf_key_cmp(struct ipf_key * a,struct ipf_key * b)748 ipf_key_cmp(struct ipf_key *a, struct ipf_key *b)
749 {
750 	int d;
751 
752 	if ((d = (a->ipfk_len - b->ipfk_len)) != 0) {
753 		return d;
754 	}
755 
756 	if ((d = (a->ipfk_ident - b->ipfk_ident)) != 0) {
757 		return d;
758 	}
759 
760 	return memcmp(a->ipfk_addr, b->ipfk_addr, a->ipfk_len);
761 }
762 
763 /*
764  * Put an ip fragment on a reassembly chain.
765  * Like insque, but pointers in middle of structure.
766  */
767 static void
ipf_enq(struct ipf * f,struct ipf * up6)768 ipf_enq(struct ipf *f, struct ipf *up6)
769 {
770 	f->ipf_up = up6;
771 	f->ipf_down = up6->ipf_down;
772 	up6->ipf_down->ipf_up = f;
773 	up6->ipf_down = f;
774 }
775 
776 /*
777  * To ipf_enq as remque is to insque.
778  */
779 static void
ipf_deq(struct ipf * f)780 ipf_deq(struct ipf *f)
781 {
782 	f->ipf_up->ipf_down = f->ipf_down;
783 	f->ipf_down->ipf_up = f->ipf_up;
784 }
785 
786 static void
ipfq_insque(struct ipfq * new,struct ipfq * old)787 ipfq_insque(struct ipfq *new, struct ipfq *old)
788 {
789 	new->ipfq_prev = old;
790 	new->ipfq_next = old->ipfq_next;
791 	old->ipfq_next->ipfq_prev = new;
792 	old->ipfq_next = new;
793 }
794 
795 static void
ipfq_remque(struct ipfq * p6)796 ipfq_remque(struct ipfq *p6)
797 {
798 	p6->ipfq_prev->ipfq_next = p6->ipfq_next;
799 	p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
800 }
801 
802 static void
ipfq_timeout(thread_call_param_t arg0,thread_call_param_t arg1)803 ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
804 {
805 #pragma unused(arg1)
806 	struct fsw_ip_frag_mgr *mgr = arg0;
807 	struct ipfq *q;
808 	uint64_t now, elapsed;
809 	uint32_t nfreed = 0;
810 
811 	net_update_uptime();
812 	now = _net_uptime;
813 
814 	SK_DF(SK_VERB_IP_FRAG, "run");
815 	lck_mtx_lock(&mgr->ipfm_lock);
816 	q = mgr->ipfm_q.ipfq_next;
817 	if (q) {
818 		while (q != &mgr->ipfm_q) {
819 			q = q->ipfq_next;
820 			elapsed = now - q->ipfq_prev->ipfq_timestamp;
821 			if (elapsed > ipfm_frag_ttl) {
822 				SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
823 				    q->ipfq_prev->ipfq_key.ipfk_ident);
824 				nfreed = ipfq_freefq(mgr, q->ipfq_prev,
825 				    q->ipfq_is_dirty ? NULL :
826 				    ipf_icmp_timeout_err);
827 			}
828 		}
829 	}
830 
831 	/* If running out of resources, drain ipfm queues (oldest one first) */
832 	if (mgr->ipfm_f_count >= mgr->ipfm_f_limit ||
833 	    mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
834 		SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
835 		    mgr->ipfm_f_count, mgr->ipfm_f_limit, mgr->ipfm_q_count,
836 		    mgr->ipfm_q_limit);
837 		uint32_t target_q_count = mgr->ipfm_q_count / 2;
838 		while (mgr->ipfm_q_count > target_q_count) {
839 			nfreed += ipfq_freefq(mgr, mgr->ipfm_q.ipfq_prev,
840 			    q->ipfq_is_dirty ? NULL : ipf_icmp_timeout_err);
841 		}
842 	}
843 
844 	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, nfreed);
845 
846 	/* re-arm the purge timer if there's work to do */
847 	if (mgr->ipfm_q_count > 0) {
848 		ipfq_sched_timeout(mgr, TRUE);
849 	}
850 	lck_mtx_unlock(&mgr->ipfm_lock);
851 }
852 
853 static void
ipfq_sched_timeout(struct fsw_ip_frag_mgr * mgr,boolean_t in_tcall)854 ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
855 {
856 	uint32_t delay = MAX(1, ipfm_timeout_tcall_ival);       /* seconds */
857 	thread_call_t tcall = mgr->ipfm_timeout_tcall;
858 	uint64_t now = mach_absolute_time();
859 	uint64_t ival, deadline = now;
860 
861 	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
862 
863 	ASSERT(tcall != NULL);
864 	if (mgr->ipfm_q_count > 0 &&
865 	    (!thread_call_isactive(tcall) || in_tcall)) {
866 		nanoseconds_to_absolutetime(delay * NSEC_PER_SEC, &ival);
867 		clock_deadline_for_periodic_event(ival, now, &deadline);
868 		(void) thread_call_enter_delayed(tcall, deadline);
869 	}
870 }
871 
872 /*
873  * @internal drain all ressambly queue for shutdown.
874  *
875  * @discussion Shutdown is called when if_detach happens, so no time exceeded
876  * icmp error are generated here.
877  */
878 static void
ipfq_drain(struct fsw_ip_frag_mgr * mgr)879 ipfq_drain(struct fsw_ip_frag_mgr *mgr)
880 {
881 	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
882 	while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q) {
883 		ipfq_freefq(mgr, mgr->ipfm_q.ipfq_next, NULL);
884 	}
885 }
886 
/* @internal sysctl handler: drain all reassembly queues on demand */
static int
ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct fsw_ip_frag_mgr *mgr = arg1;     /* set via skoid_add_handler */

	SKOID_PROC_CALL_GUARD;

	lck_mtx_lock(&mgr->ipfm_lock);
	ipfq_drain(mgr);
	lck_mtx_unlock(&mgr->ipfm_lock);

	return 0;
}
901 
902 static struct ipfq *
ipfq_alloc(struct fsw_ip_frag_mgr * mgr,int how)903 ipfq_alloc(struct fsw_ip_frag_mgr *mgr, int how)
904 {
905 	struct mbuf *t;
906 	struct ipfq *q;
907 
908 	if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
909 		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_QUEUE_LIMIT);
910 		return NULL;
911 	}
912 
913 	t = m_get(how, MT_FTABLE);
914 	if (t != NULL) {
915 		mgr->ipfm_q_count++;
916 		q = mtod(t, struct ipfq *);
917 		bzero(q, sizeof(*q));
918 		q->ipfq_is_dirty = false;
919 	} else {
920 		q = NULL;
921 	}
922 	return q;
923 }
924 
925 /* free q */
926 static void
ipfq_free(struct fsw_ip_frag_mgr * mgr,struct ipfq * q)927 ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q)
928 {
929 	(void) m_free(dtom(q));
930 	mgr->ipfm_q_count--;
931 }
932 
933 /*
934  * Free all fragments, keep q.
935  * @return: number of frags freed
936  */
937 static uint32_t
ipfq_freef(struct fsw_ip_frag_mgr * mgr,struct ipfq * q,void (* ipf_cb)(struct fsw_ip_frag_mgr *,struct ipf *))938 ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
939     void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
940 {
941 	struct ipf *f, *down6;
942 	uint32_t nfrags = 0;
943 
944 	for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
945 		nfrags++;
946 		down6 = f->ipf_down;
947 		ipf_deq(f);
948 		if (ipf_cb != NULL) {
949 			(*ipf_cb)(mgr, f);
950 		}
951 		ipf_free_pkt(f);
952 		ipf_free(mgr, f);
953 	}
954 
955 	return nfrags;
956 }
957 
958 /* Free both all fragments and q
959  * @return: number of frags freed
960  */
961 static uint32_t
ipfq_freefq(struct fsw_ip_frag_mgr * mgr,struct ipfq * q,void (* ipf_cb)(struct fsw_ip_frag_mgr *,struct ipf *))962 ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
963     void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
964 {
965 	uint32_t freed_count;
966 	freed_count = ipfq_freef(mgr, q, ipf_cb);
967 	ipfq_remque(q);
968 	ipfq_free(mgr, q);
969 	return freed_count;
970 }
971 
972 static struct ipf *
ipf_alloc(struct fsw_ip_frag_mgr * mgr)973 ipf_alloc(struct fsw_ip_frag_mgr *mgr)
974 {
975 	struct mbuf *t;
976 	struct ipf *f;
977 
978 	if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
979 		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
980 		return NULL;
981 	}
982 
983 	t = m_get(M_DONTWAIT, MT_FTABLE);
984 	if (t != NULL) {
985 		mgr->ipfm_f_count++;
986 		f = mtod(t, struct ipf *);
987 		bzero(f, sizeof(*f));
988 	} else {
989 		f = NULL;
990 	}
991 	return f;
992 }
993 
994 static void
ipf_free_pkt(struct ipf * f)995 ipf_free_pkt(struct ipf *f)
996 {
997 	struct __kern_packet *pkt = f->ipf_pkt;
998 	ASSERT(pkt != NULL);
999 	pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
1000 	    SK_PTR_ADDR(pkt));
1001 }
1002 
1003 static void
ipf_free(struct fsw_ip_frag_mgr * mgr,struct ipf * f)1004 ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
1005 {
1006 	(void) m_free(dtom(f));
1007 	mgr->ipfm_f_count--;
1008 }
1009