/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

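/*
 * Bail out of classification: log the offending condition along with a
 * hex dump of the leading bytes of the packet, then return ENOTSUP so
 * that the packet still goes through the regular flow lookup path.
 */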
#define CL_SKIP_ON(t)                           \
	if (__improbable(t)) {                  \
	        SK_ERR("%d: skip " #t, __LINE__); \
	        SK_ERR("%s %s", if_name(ifp), sk_dump("buf", \
	            pkt_buf + pkt->pkt_headroom, pkt->pkt_length, \
	            MIN(128, bdlen), NULL, 0)); \
	        error = ENOTSUP;                \
	        goto done;                      \
	}

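/*
 * Stop after L3: record the IP header length, zero the L4 ports (which
 * alias the UDP ones, see the _CASSERTs below) so that no stale data is
 * left in the flow key, and return success.
 */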
#define CL_SKIP_L4()                            \
	do {                                    \
	        pkt->pkt_flow_ip_hlen = l3hlen; \
	        pkt->pkt_flow_tcp_src = 0;      \
	        pkt->pkt_flow_tcp_dst = 0;      \
	        error = 0;                      \
	        goto done;                      \
	} while (0)

/*
 * Packet flow parser
 *
 * Parse a contiguous chunk of packet header fields.
 *
 * The idea here is that while we have the headers in the CPU cache,
 * we do as much parsing as necessary and store the results in __flow.
 *
 * We assume that outbound packets from the host (BSD) stack never
 * get here, i.e. we only handle channel-based outbound traffic.
 *
 * @param pkt
 *   packet to be classified
 * @param ifp
 *   associated network interface
 * @param af
 *   address family
 * @param input
 *   true if the packet is inbound
 *
 * @return
 * We return ENOTSUP to indicate that we can't classify the packet,
 * and that the packet should still be forwarded to the lookup path.
 * Any other non-zero value will cause the packet to be dropped.
 */
int
flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af,
    bool input)
{
	/* these begin at the same offset in the packet, hence the unions */
	union {
		volatile struct ip *_iph;
		volatile struct ip6_hdr *_ip6;
	} _l3;
#define iph _l3._iph
#define ip6 _l3._ip6
	union {
		volatile struct tcphdr *_tcph;
		volatile struct udphdr *_udph;
	} _l4;
#define tcph _l4._tcph
#define udph _l4._udph
	uint32_t mtu = ifp->if_mtu;

	size_t pkt_len;       /* remaining packet length left for parsing */
	uint16_t cls_len;

	/*
	 * These lengths are parsed from the packet headers and need to
	 * be validated incrementally from L3 to L4.
	 */
	uint8_t l3hlen = 0;     /* IP header length */
	uint16_t l3tlen = 0;    /* total length of IP packet */
	uint8_t l4hlen = 0;     /* TCP/UDP header length */
	uint16_t ulen = 0;      /* user data length */

	int error = 0;

	/* must be 16-byte aligned due to the use of sk_copy* below */
	_CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0);

	_CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8);
	_CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv4_addrs, _src)) ==
	    (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src)));

	_CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32);
	_CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv6_addrs, _src)) ==
	    (offsetof(struct ip6_hdr, ip6_dst) -
	    offsetof(struct ip6_hdr, ip6_src)));

	/* __flow_l4_tcp must mirror tcphdr for the first 16 bytes */
	_CASSERT(sizeof(struct __flow_l4_tcp) == 16);
	_CASSERT((offsetof(struct __flow_l4_tcp, _dst) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_dport) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _seq) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_seq) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _ack) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_ack) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _flags) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_flags) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _win) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_win) -
	    offsetof(struct tcphdr, th_sport)));

	/* ensure the same offsets are used for TCP and UDP */
	_CASSERT(sizeof(struct __flow_l4_udp) == 8);
	_CASSERT(offsetof(struct __flow, flow_tcp_src) ==
	    offsetof(struct __flow, flow_udp_src));
	_CASSERT(offsetof(struct __flow, flow_tcp_dst) ==
	    offsetof(struct __flow, flow_udp_dst));

	/* parsing starts at L3; count the SDU length after the L2 header */
	ASSERT(pkt->pkt_l2_len <= pkt->pkt_length);
	pkt_len = pkt->pkt_length - pkt->pkt_l2_len;

	/*
	 * We restrict the data length available for classification to
	 * the portion of the L3 datagram available in the first buflet.
	 *
	 * Note that compat netif sets the packet length and buflet data
	 * length metadata to the original length of the packet, although
	 * the actual buffer is limited to NETIF_COMPAT_BUF_SIZE (128 bytes).
	 */
	uint8_t *pkt_buf, *l3_hdr;
	uint16_t bdlen, bdlim, bdoff;

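	/*
	 * Fetch the first buflet's buffer address, data length, limit,
	 * and data offset; everything we classify below must be
	 * resident in that buflet.
	 */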
	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	cls_len = bdlim - bdoff;
	cls_len -= pkt->pkt_l2_len;
	cls_len = (uint16_t)MIN(cls_len, pkt_len);
	VERIFY(pkt_len >= cls_len);

	/* iph and ip6 alias each other (see the _l3 union above) */
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
	iph = (volatile struct ip *)(void *)l3_hdr;

	VERIFY(af != AF_UNSPEC);

	pkt->pkt_flow_ip_ver = 0;

	/*
	 * This code is in the hot data path, so we try to be as efficient
	 * as possible, and hence the use of unrolled loads/stores.
	 */

	/***************** L3 header (IP/IPv6) *****************/
	switch (af) {
	case AF_INET:
		CL_SKIP_ON(cls_len < sizeof(struct ip));
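		/* ip_hl counts 32-bit words, hence the << 2 */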
		l3hlen = (uint8_t)(iph->ip_hl << 2);
		CL_SKIP_ON(l3hlen < sizeof(struct ip));
		CL_SKIP_ON(cls_len < l3hlen);

		/* don't allow outgoing channel-based packets with option(s) */
		CL_SKIP_ON(!input && l3hlen != sizeof(struct ip));

		l3tlen = ntohs(iph->ip_len);

		CL_SKIP_ON(l3tlen < l3hlen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON(iph->ip_v != IPVERSION);

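		/*
		 * Copy the IPv4 src/dst pair with the widest load/store
		 * the source alignment allows; the _CASSERTs above
		 * guarantee that __flow mirrors the wire layout.
		 */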
		if (__probable(IS_P2ALIGNED(&iph->ip_src, 8))) {
			sk_copy64_8(__DECONST(uint64_t *, &iph->ip_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else if (IS_P2ALIGNED(&iph->ip_src, 4)) {
			sk_copy32_8(__DECONST(uint32_t *, &iph->ip_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else {
			bcopy(__DECONST(void *, &iph->ip_src),
			    (void *)&pkt->pkt_flow_ipv4_addrs,
			    sizeof(struct __flow_l3_ipv4_addrs));
		}

		pkt->pkt_flow_ip_ver = IPVERSION;
		pkt->pkt_flow_ip_proto = iph->ip_p;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iph;

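		/*
		 * With IP_DF and the reserved bit masked off, a non-zero
		 * ip_off means MF is set and/or the fragment offset is
		 * non-zero, i.e. the packet is a fragment.
		 */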
		if (__improbable(ntohs(iph->ip_off) & ~(IP_DF | IP_RF))) {
			pkt->pkt_flow_ip_is_frag = TRUE;
			pkt->pkt_flow_ip_frag_id = iph->ip_id;
			/* we only parse l4 in the 1st frag */
			if ((ntohs(iph->ip_off) & IP_OFFMASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
		}
		break;

	case AF_INET6:
		l3hlen = sizeof(struct ip6_hdr);
		CL_SKIP_ON(cls_len < l3hlen);

		l3tlen = l3hlen + ntohs(ip6->ip6_plen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION);

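		/* 32-byte IPv6 src/dst copy, same alignment dispatch as IPv4 */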
		if (__probable(IS_P2ALIGNED(&ip6->ip6_src, 8))) {
			sk_copy64_32(__DECONST(uint64_t *, &ip6->ip6_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else if (IS_P2ALIGNED(&ip6->ip6_src, 4)) {
			sk_copy32_32(__DECONST(uint32_t *, &ip6->ip6_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else {
			bcopy(__DECONST(void *, &ip6->ip6_src),
			    (void *)&pkt->pkt_flow_ipv6_addrs,
			    sizeof(struct __flow_l3_ipv6_addrs));
		}

		pkt->pkt_flow_ip_ver = IPV6_VERSION;
		pkt->pkt_flow_ip_proto = ip6->ip6_nxt;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)ip6;

		/* only parse the next immediate extension header for frags */
		if (__improbable(ip6->ip6_nxt == IPPROTO_FRAGMENT)) {
			volatile struct ip6_frag *ip6f;
			ip6f = (volatile struct ip6_frag *)(ip6 + 1);
			CL_SKIP_ON(cls_len < l3hlen + sizeof(struct ip6_frag));
			pkt->pkt_flow_ip_is_frag = 1;
			pkt->pkt_flow_ip_frag_id = ip6f->ip6f_ident;
			pkt->pkt_flow_ip_proto = ip6f->ip6f_nxt;
			l3hlen += sizeof(struct ip6_frag);
			CL_SKIP_ON(l3tlen < l3hlen);
			/* we only parse l4 in the 1st frag */
			if ((ip6f->ip6f_offlg & IP6F_OFF_MASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
			/*
			 * An atomic fragment (zero offset, no more-fragments
			 * bit) is processed as a non-fragment; cf. RFC 6946.
			 */
			if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
				pkt->pkt_flow_ip_is_frag = 0;
			}
		}
		break;

	default:
		error = ENOTSUP;
		goto done;
	}

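	/*
	 * L3 parsing is complete; anything other than TCP or UDP is
	 * classified at L3 only.
	 */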
	pkt->pkt_flow_ip_hlen = l3hlen;
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_TCP &&
	    pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		error = 0;
		goto done;
	}

	/**************** L4 header (TCP/UDP) *****************/

	/* this takes care of the UDP header as well (see the _l4 union) */
	tcph = __DECONST(volatile struct tcphdr *,
	    (volatile uint8_t *)iph + l3hlen);
	ulen = (l3tlen - l3hlen);
	if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*tcph)) ||
		    (ulen < sizeof(*tcph)));
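		/* th_off counts 32-bit words, hence the << 2 */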
		l4hlen = (uint8_t)(tcph->th_off << 2);
		CL_SKIP_ON(l4hlen < sizeof(*tcph));
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_tcp_hlen = l4hlen;
		pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcph;
	} else {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*udph)) ||
		    (ulen < sizeof(*udph)));
		l4hlen = sizeof(*udph);
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_udp_hlen = l4hlen;
		pkt->pkt_flow_udp_hdr = (mach_vm_address_t)udph;
	}

	if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		ulen -= l4hlen;
		pkt->pkt_flow_ulen = ulen;
	} else {
		/*
		 * We can't determine the user data length of a fragment
		 * until it is reassembled.
		 */
		pkt->pkt_flow_ulen = 0;
	}

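	/*
	 * Copy the L4 ports (plus seq/ack/flags/win for TCP) with the
	 * widest access the alignment allows; per the _CASSERTs above,
	 * __flow_l4_tcp mirrors the first 16 bytes of tcphdr and
	 * __flow_l4_udp the 8-byte UDP header.
	 */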
	if (__probable(IS_P2ALIGNED(&tcph->th_sport, 4))) {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			sk_copy32_16(__DECONST(uint32_t *, &tcph->th_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_tcp_src);
		} else {
			sk_copy32_8(__DECONST(uint32_t *, &udph->uh_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_udp_src);
		}
	} else {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			bcopy(__DECONST(void *, &tcph->th_sport),
			    (void *)&pkt->pkt_flow_tcp,
			    sizeof(struct __flow_l4_tcp));
		} else {
			bcopy(__DECONST(void *, &udph->uh_sport),
			    (void *)&pkt->pkt_flow_udp,
			    sizeof(struct __flow_l4_udp));
		}
	}

	if (!input && pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
	    pkt->pkt_flow_ulen != 0) {
		/*
		 * Following the logic in tcp_output(), we mark this
		 * packet only if it carries a non-empty payload; note
		 * that pkt_flow_tcp_seq is in network byte order.
		 */
		pkt->pkt_pflags |= PKT_F_START_SEQ;
	}
done:
	if (__probable(error == 0)) {
		SK_DF(SK_VERB_FLOW_CLASSIFY, "pkt_length %u l3_ip_len %u "
		    "l3_ip_ver 0x%x l3_proto %u l4_sport %u l4_dport %u",
		    pkt->pkt_length, l3tlen, pkt->pkt_flow_ip_ver,
		    pkt->pkt_flow_ip_proto, ntohs(pkt->pkt_flow_tcp_src),
		    ntohs(pkt->pkt_flow_tcp_dst));
		/* on output, trim metadata length if not same as IP length */
		if (!input) {
			if (__improbable(pkt->pkt_length != l3tlen)) {
				SK_ERR("packet is too long (%u), trimming to "
				    "IP length (%d)", pkt->pkt_length, l3tlen);
				METADATA_SET_LEN(pkt, l3tlen, bdoff);
			}
			if (__improbable(pkt->pkt_length > mtu)) {
				SK_ERR("dropped; length (%u) exceeds MTU (%d)",
				    pkt->pkt_length, mtu);
				SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len,
				    128, NULL, 0));
				error = EMSGSIZE;
				goto fail;
			}
		}
		/*
		 * Mark QUM_F_FLOW_CLASSIFIED on the packet to indicate
		 * that the __flow structure has valid info now.
		 */
		pkt->pkt_qum_qflags |= QUM_F_FLOW_CLASSIFIED;
		return 0;
	}

fail:
	ASSERT(error != 0 && !(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED));
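	/* scrub any partially-parsed flow info before handing back */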
	KPKT_CLEAR_FLOW_ALL(pkt->pkt_flow);

	return error;
}