1 /*
2 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
31 #include <netinet/tcp.h>
32 #include <netinet/udp.h>
33 #include <netinet/ip.h>
34 #include <netinet/ip6.h>
35
/*
 * Abort classification when condition `t' holds: log the condition and a
 * bounded hex dump of the packet buffer, then jump to the `done' label
 * with ENOTSUP so the packet is still forwarded to the lookup path.
 * Wrapped in do/while(0) so the macro expands to a single statement and
 * is safe inside unbraced if/else bodies (the call-site `;' completes it).
 */
#define CL_SKIP_ON(t) do { \
	if (__improbable(t)) { \
	        SK_ERR("%d: skip " #t, __LINE__); \
	        SK_ERR("%s %s", if_name(ifp), sk_dump("buf", \
	            pkt_buf + pkt->pkt_headroom, __packet_get_real_data_length(pkt), \
	            MIN(128, bdlen), NULL, 0)); \
	        error = ENOTSUP; \
	        goto done; \
	} \
} while (0)
45
/*
 * Give up on L4 parsing (e.g. for non-first IP fragments): record the
 * already-validated L3 header length, zero the transport ports (the TCP
 * and UDP port fields share the same offsets in __flow), and exit via
 * `done' with success.  No trailing semicolon after while(0) — the
 * call site supplies it, keeping the do/while(0) single-statement idiom
 * intact (a trailing `;' here would break if/else call sites).
 */
#define CL_SKIP_L4() \
	do { \
	        pkt->pkt_flow_ip_hlen = l3hlen; \
	        pkt->pkt_flow_tcp_src = 0; \
	        pkt->pkt_flow_tcp_dst = 0; \
	        error = 0; \
	        goto done; \
	} while (0)
54
55 /*
56 * Packet flow parser
57 *
 * Parse a contiguous chunk of packet header fields.
59 *
60 * The idea here is that while we have the headers in the CPU cache,
61 * do as much parsing as necessary and store the results in __flow.
62 *
63 * We assume that outbound packets from the host (BSD) stack never
64 * get here, i.e. we only handle channel-based outbound traffic.
65 *
66 * @param pkt
67 * packet to be classified
68 * @param ifp
69 * associated network interface
70 * @param af
71 * address family
 * @param input
 *    true if the packet is inbound (receive path); false for outbound
74 *
75 * @return
76 * We return ENOTSUP to indicate that we can't classify the packet,
77 * and that the packet should still be forwarded to the lookup path.
78 * Any other non-zero value will cause the packet to be dropped.
79 *
80 */
int
flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af,
    bool input)
{
#pragma unused(ifp)
	/* these begin at the same offset in the packet, hence the unions */
	union {
		volatile struct ip *__indexable _iph;
		volatile struct ip6_hdr *__indexable _ip6;
	} _l3;
#define iph _l3._iph
#define ip6 _l3._ip6
	union {
		volatile struct tcphdr *_tcph;
		volatile struct udphdr *_udph;
	} _l4;
#define tcph _l4._tcph
#define udph _l4._udph
	uint32_t mtu = ifp->if_mtu;

	size_t pkt_len;		/* remaining packet length left for parsing */
	uint32_t cls_len;	/* bytes of L3+ data visible for classification */

	/*
	 * These are length parsed from packet header, needs to be
	 * incrementally validated from l3 to l4
	 */
	uint8_t l3hlen = 0;	/* IP header length */
	uint16_t l3tlen = 0;	/* total length of IP packet */
	uint8_t l4hlen = 0;	/* TCP/UDP header length */
	uint16_t ulen = 0;	/* user data length */

	int error = 0;

	/*
	 * Compile-time layout checks: the unrolled sk_copy* loads/stores
	 * below depend on __flow field alignment and on the relative field
	 * offsets in __flow mirroring those of the wire headers.
	 */
	/* must be 16-bytes aligned due to use of sk_copy* below */
	_CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0);

	_CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8);
	_CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv4_addrs, _src)) ==
	    (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src)));

	_CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32);
	_CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv6_addrs, _src)) ==
	    (offsetof(struct ip6_hdr, ip6_dst) -
	    offsetof(struct ip6_hdr, ip6_src)));

	/* __flow_l4_tcp must mirror tcphdr for the first 16-bytes */
	_CASSERT(sizeof(struct __flow_l4_tcp) == 16);
	_CASSERT((offsetof(struct __flow_l4_tcp, _dst) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_dport) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _seq) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_seq) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _ack) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_ack) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _flags) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_flags) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _win) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_win) -
	    offsetof(struct tcphdr, th_sport)));

	/* ensure same offsets used for TCP and UDP */
	_CASSERT(sizeof(struct __flow_l4_udp) == 8);
	_CASSERT(offsetof(struct __flow, flow_tcp_src) ==
	    offsetof(struct __flow, flow_udp_src));
	_CASSERT(offsetof(struct __flow, flow_tcp_dst) ==
	    offsetof(struct __flow, flow_udp_dst));


	/* parsing starts from l3, count SDU length after l2 header */
	ASSERT(pkt->pkt_l2_len <= pkt->pkt_length);
	pkt_len = pkt->pkt_length - pkt->pkt_l2_len;

	/*
	 * we restrict the data length available for classification to the
	 * portion of L3 datagram available in the first buflet.
	 */
	/*
	 * compat netif sets the packet length and buflet data length
	 * metadata to the original length of the packet although the
	 * actual buffer is limited to NETIF_COMPAT_BUF_SIZE (128 bytes).
	 */
	uint8_t *pkt_buf, *l3_hdr;
	uint32_t bdlen, bdlim, bdoff;
	uint32_t pkt_buf_size;
	uint8_t *__sized_by(pkt_buf_size) pkt_buf_cpy;

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	pkt_buf_cpy = pkt_buf;
	pkt_buf_size = bdlim;
	/*
	 * cls_len = space left in the first buflet past the data offset and
	 * the L2 header, clamped to the SDU length.  NOTE(review): assumes
	 * bdlim - bdoff >= pkt_l2_len; otherwise cls_len underflows before
	 * the MIN() clamp — confirm callers guarantee this.
	 */
	cls_len = bdlim - bdoff;
	cls_len -= pkt->pkt_l2_len;
	cls_len = (uint16_t)MIN(cls_len, pkt_len);
	VERIFY(pkt_len >= cls_len);

	/* takes care of ip6 assignment too (iph/ip6 alias via the union) */
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
	pkt_buf_cpy = l3_hdr;
	pkt_buf_size = cls_len;
	iph = (volatile struct ip *__indexable)(void *)pkt_buf_cpy;

	VERIFY(af != AF_UNSPEC);

	pkt->pkt_flow_ip_ver = 0;

	/*
	 * This code is in the hot data path, so we try to be as efficient
	 * as possible, and hence the use of unrolled loads/stores.
	 */

	/***************** L3 header (IP/IPv6) *****************/
	switch (af) {
	case AF_INET:
		/* validate lengths incrementally: fixed hdr, IHL, total len */
		CL_SKIP_ON(cls_len < sizeof(struct ip));
		l3hlen = (uint8_t)(iph->ip_hl << 2);	/* IHL is in 32-bit words */
		CL_SKIP_ON(l3hlen < sizeof(struct ip));
		CL_SKIP_ON(cls_len < l3hlen);

		/* don't allow outgoing channel-based packet with option(s) */
		CL_SKIP_ON(!input && l3hlen != sizeof(struct ip));

		l3tlen = ntohs(iph->ip_len);

		CL_SKIP_ON(l3tlen < l3hlen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON(iph->ip_v != IPVERSION);

		/*
		 * Copy src+dst IPv4 addresses (8 bytes, adjacent on the
		 * wire) into __flow, picking the widest access the source
		 * alignment permits.
		 */
		if (__probable(IS_P2ALIGNED(&iph->ip_src, 8))) {
			sk_copy64_8(__DECONST(uint64_t *, &iph->ip_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else if (IS_P2ALIGNED(&iph->ip_src, 4)) {
			uint32_t *src;
			uint32_t *dst;

			src = (uint32_t *)(void *)(__DEVOLATILE(char *, iph) +
			    offsetof(struct ip, ip_src));
			dst = (uint32_t *__indexable)(&pkt->pkt_flow_ipv4_addrs);
			sk_copy32_8(src, dst);
		} else {
			/* unaligned fallback: plain byte copy */
			bcopy(__DECONST(struct __flow_l3_ipv4_addrs *__single, &iph->ip_src),
			    (struct __flow_l3_ipv4_addrs *__single) &pkt->pkt_flow_ipv4_addrs,
			    sizeof(struct __flow_l3_ipv4_addrs));
		}

		pkt->pkt_flow_ip_ver = IPVERSION;
		pkt->pkt_flow_ip_proto = iph->ip_p;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iph;

		/* any bit outside DF|reserved set means MF and/or offset */
		if (__improbable(ntohs(iph->ip_off) & ~(IP_DF | IP_RF))) {
			pkt->pkt_flow_ip_is_frag = TRUE;
			pkt->pkt_flow_ip_frag_id = iph->ip_id;
			/* we only parse l4 in the 1st frag */
			if ((ntohs(iph->ip_off) & IP_OFFMASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
		}
		break;

	case AF_INET6:
		/* IPv6 fixed header has no options; extensions come after */
		l3hlen = sizeof(struct ip6_hdr);
		CL_SKIP_ON(cls_len < l3hlen);

		/*
		 * NOTE(review): l3tlen is uint16_t; 40 + ip6_plen can exceed
		 * 65535 for near-maximum payload lengths and would wrap —
		 * confirm such packets cannot reach here (e.g. MTU-bounded).
		 */
		l3tlen = l3hlen + ntohs(ip6->ip6_plen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION);

		/* NOTE(review): this type appears unused below */
		struct ipv6addrs {
			struct in6_addr src;
			struct in6_addr dst;
		};

		/*
		 * Copy src+dst IPv6 addresses (32 bytes, adjacent on the
		 * wire), again selecting the copy width by alignment.
		 */
		if (__probable(IS_P2ALIGNED(&ip6->ip6_src, 8))) {
			uint64_t *src;
			uint64_t *dst;

			src = (uint64_t *)(void *)(__DEVOLATILE(char *, ip6) +
			    offsetof(struct ip6_hdr, ip6_src));
			dst = (uint64_t *__indexable)(void *)(&pkt->pkt_flow_ipv6_addrs);
			sk_copy64_32(src, dst);
		} else if (IS_P2ALIGNED(&ip6->ip6_src, 4)) {
			uint32_t *src;
			uint32_t *dst;

			src = (uint32_t *)(void *)(__DEVOLATILE(char *, ip6) +
			    offsetof(struct ip6_hdr, ip6_src));
			dst = (uint32_t *__indexable)(&pkt->pkt_flow_ipv6_addrs);
			sk_copy32_32(src, dst);
		} else {
			/* unaligned fallback: plain byte copy */
			bcopy(__DECONST(struct __flow_l3_ipv6_addrs *__single, &ip6->ip6_src),
			    (struct __flow_l3_ipv6_addrs *__single) &pkt->pkt_flow_ipv6_addrs,
			    sizeof(struct __flow_l3_ipv6_addrs));
		}

		pkt->pkt_flow_ip_ver = IPV6_VERSION;
		pkt->pkt_flow_ip_proto = ip6->ip6_nxt;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)ip6;

		/* only parse the next immediate extension header for frags */
		if (__improbable(ip6->ip6_nxt == IPPROTO_FRAGMENT)) {
			volatile struct ip6_frag *ip6f;
			/* fragment header sits right after the fixed header */
			ip6f = (volatile struct ip6_frag *)(ip6 + 1);
			CL_SKIP_ON(cls_len < l3hlen + sizeof(struct ip6_frag));
			pkt->pkt_flow_ip_is_frag = 1;
			pkt->pkt_flow_ip_frag_id = ip6f->ip6f_ident;
			pkt->pkt_flow_ip_proto = ip6f->ip6f_nxt;
			l3hlen += sizeof(struct ip6_frag);
			CL_SKIP_ON(l3tlen < l3hlen);
			/* we only parse l4 in the 1st frag */
			if ((ip6f->ip6f_offlg & IP6F_OFF_MASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
			/* process atomic frag (offset 0, no MF) as non-frag */
			if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
				pkt->pkt_flow_ip_is_frag = 0;
			}
		}
		break;

	default:
		/* unknown address family; defer to the lookup path */
		error = ENOTSUP;
		goto done;
	}

	pkt->pkt_flow_ip_hlen = l3hlen;
	/* only TCP and UDP get L4 parsing; anything else is done here */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_TCP &&
	    pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		error = 0;
		goto done;
	}

	/**************** L4 header (TCP/UDP) *****************/
	ulen = (l3tlen - l3hlen);	/* L4 header + payload bytes */
	if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(struct tcphdr)) ||
		    (ulen < sizeof(struct tcphdr)));
		tcph = __DECONST(volatile struct tcphdr *,
		    (volatile uint8_t *)iph + l3hlen);
		l4hlen = (uint8_t)(tcph->th_off << 2);	/* data offset in words */
		CL_SKIP_ON(l4hlen < sizeof(*tcph));
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_tcp_hlen = l4hlen;
		pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcph;
	} else {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(struct udphdr)) ||
		    (ulen < sizeof(struct udphdr)));
		udph = __DECONST(volatile struct udphdr *,
		    (volatile uint8_t *)iph + l3hlen);
		l4hlen = sizeof(*udph);	/* UDP header is fixed size */
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_udp_hlen = l4hlen;
		pkt->pkt_flow_udp_hdr = (mach_vm_address_t)udph;
	}

	if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		ulen -= l4hlen;	/* now just the payload length */
		pkt->pkt_flow_ulen = ulen;
	} else {
		/*
		 * We can't determine user data length for fragment until
		 * it is reassembled.
		 */
		pkt->pkt_flow_ulen = 0;
	}

	/*
	 * Copy ports (+ seq/ack/flags/win for TCP) into __flow.  tcph and
	 * udph alias the same address via the _l4 union, so checking
	 * tcph->th_sport alignment is valid for the UDP case too.
	 */
	if (__probable(IS_P2ALIGNED(&tcph->th_sport, 4))) {
		uint32_t *src;
		uint32_t *dst;

		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			src = __unsafe_forge_bidi_indexable(uint32_t *,
			    __DECONST(uint32_t *, &tcph->th_sport),
			    sizeof(uint32_t) * 4);
			dst = __unsafe_forge_bidi_indexable(uint32_t *,
			    (uint32_t *)(void *)&pkt->pkt_flow_tcp_src,
			    sizeof(uint32_t) * 4);
			sk_copy32_16(src, dst);
		} else {
			src = __unsafe_forge_bidi_indexable(uint32_t *,
			    __DECONST(uint32_t *, &udph->uh_sport),
			    sizeof(uint32_t) * 2);
			dst = __unsafe_forge_bidi_indexable(uint32_t *,
			    (uint32_t *)(void *) &pkt->pkt_flow_udp_src,
			    sizeof(uint32_t) * 2);
			sk_copy32_8(src, dst);
		}
	} else {
		/* unaligned fallback: plain byte copy */
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			bcopy(__DECONST(struct __flow_l4_tcp *__single, tcph),
			    (struct __flow_l4_tcp *__single) &pkt->pkt_flow_tcp,
			    sizeof(struct __flow_l4_tcp));
		} else {
			bcopy(__DECONST(struct __flow_l4_udp *__single, udph),
			    (struct __flow_l4_udp *__single) &pkt->pkt_flow_udp,
			    sizeof(struct __flow_l4_udp));
		}
	}

	if (!input && pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
	    pkt->pkt_flow_ulen != 0) {
		/*
		 * Following the logic in tcp_output(), we mark
		 * this if the payload is non-zero; note that
		 * the pkt_flow_tcp_seq is in network byte order.
		 */
		pkt->pkt_pflags |= PKT_F_START_SEQ;
	}
done:
	/*
	 * Success path: optionally trim/validate output length, then mark
	 * the packet classified.  Any non-zero error returns with the
	 * __flow fields left as-is (ENOTSUP callers fall back to lookup);
	 * only the EMSGSIZE path below clears the flow state via `fail'.
	 */
	if (__probable(error == 0)) {
		SK_DF(SK_VERB_FLOW_CLASSIFY, "pkt_length %u l3_ip_len %u "
		    "l3_ip_ver 0x%x l3_proto %u l4_sport %u l4_dport %u",
		    pkt->pkt_length, l3tlen, pkt->pkt_flow_ip_ver,
		    pkt->pkt_flow_ip_proto, ntohs(pkt->pkt_flow_tcp_src),
		    ntohs(pkt->pkt_flow_tcp_dst));
		/* on output, trim metadata length if not same as IP length */
		if (!input) {
			if (__improbable(pkt->pkt_length != (l3tlen + pkt->pkt_l2_len))) {
				SK_ERR("packet is too long (%u), trimming to "
				    "IP + L2 length (%d)", pkt->pkt_length,
				    l3tlen + pkt->pkt_l2_len);
				METADATA_SET_LEN(pkt, l3tlen + pkt->pkt_l2_len, bdoff);
			}
			/* drop oversized non-GSO packets or oversized segments */
			if (__improbable(((pkt->pkt_length > mtu) &&
			    (pkt->pkt_proto_seg_sz == 0)) ||
			    (pkt->pkt_proto_seg_sz > mtu))) {
				SK_ERR("dropped; length (%u) exceeds MTU (%d) "
				    " proto_seg_sz %d",
				    pkt->pkt_length, mtu,
				    pkt->pkt_proto_seg_sz);
				SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len,
				    128, NULL, 0));
				error = EMSGSIZE;
				goto fail;
			}
		}
		/*
		 * Mark QUM_F_FLOW_CLASSIFIED on the packet to indicate
		 * that the __flow structure has valid info now.
		 */
		pkt->pkt_qum_qflags |= QUM_F_FLOW_CLASSIFIED;
		return 0;
	}

fail:
	/* failure: flow metadata is invalid; wipe it before returning */
	ASSERT(error != 0 && !(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED));
	KPKT_CLEAR_FLOW_ALL(pkt->pkt_flow);

	return error;
}
451