1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
31 #include <netinet/tcp.h>
32 #include <netinet/udp.h>
33 #include <netinet/ip.h>
34 #include <netinet/ip6.h>
35
/*
 * Bail out of classification when predicate `t' holds: log the failed
 * check plus a hex dump of the packet (capped at 128 bytes), set
 * ENOTSUP and jump to `done' so the packet is punted to the lookup
 * path rather than dropped (see flow_pkt_classify() return contract).
 *
 * Wrapped in do/while(0) so the expansion is a single statement and
 * is safe in unbraced if/else contexts (the bare-if form was a
 * dangling-else hazard).  Relies on `ifp', `pkt', `pkt_buf', `bdlen',
 * `error' and the `done' label being in scope at the expansion site.
 */
#define CL_SKIP_ON(t) do {                                              \
	if (__improbable(t)) {                                          \
	        SK_ERR("%d: skip " #t, __LINE__);                       \
	        SK_ERR("%s %s", if_name(ifp), sk_dump("buf",            \
	            pkt_buf + pkt->pkt_headroom, pkt->pkt_length,       \
	            MIN(128, bdlen), NULL, 0));                         \
	        error = ENOTSUP;                                        \
	        goto done;                                              \
	}                                                               \
} while (0)
45
/*
 * Stop parsing at L3: record the IP header length, zero the TCP/UDP
 * port fields (flow_tcp_* and flow_udp_* share offsets, so this
 * clears both views), and exit successfully via `done'.  Used for
 * non-first fragments, whose L4 header lives in another fragment.
 *
 * Relies on `pkt', `l3hlen', `error' and the `done' label at the
 * expansion site.  Note: the trailing semicolon after while (0) has
 * been removed — it turned `CL_SKIP_L4();' into two statements,
 * defeating the do/while(0) idiom in unbraced if/else contexts.
 */
#define CL_SKIP_L4()                                                    \
	do {                                                            \
	        pkt->pkt_flow_ip_hlen = l3hlen;                         \
	        pkt->pkt_flow_tcp_src = 0;                              \
	        pkt->pkt_flow_tcp_dst = 0;                              \
	        error = 0;                                              \
	        goto done;                                              \
	} while (0)
54
55 /*
56 * Packet flow parser
57 *
58 * Parse a continuous chunk of packet header fields.
59 *
60 * The idea here is that while we have the headers in the CPU cache,
61 * do as much parsing as necessary and store the results in __flow.
62 *
63 * We assume that outbound packets from the host (BSD) stack never
64 * get here, i.e. we only handle channel-based outbound traffic.
65 *
66 * @param pkt
67 * packet to be classified
68 * @param ifp
69 * associated network interface
70 * @param af
71 * address family
 * @param input
 *     true if the packet is inbound (received on ifp); false for
 *     channel-based outbound traffic
74 *
75 * @return
76 * We return ENOTSUP to indicate that we can't classify the packet,
77 * and that the packet should still be forwarded to the lookup path.
78 * Any other non-zero value will cause the packet to be dropped.
79 *
80 */
int
flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af,
    bool input)
{
	/* these begin at the same offset in the packet, hence the unions */
	union {
		volatile struct ip *_iph;
		volatile struct ip6_hdr *_ip6;
	} _l3;
#define iph _l3._iph
#define ip6 _l3._ip6
	union {
		volatile struct tcphdr *_tcph;
		volatile struct udphdr *_udph;
	} _l4;
#define tcph _l4._tcph
#define udph _l4._udph
	uint32_t mtu = ifp->if_mtu;

	size_t pkt_len; /* remaining packet length left for parsing */
	uint16_t cls_len; /* bytes available for classification (first buflet) */

	/*
	 * These are length parsed from packet header, needs to be
	 * incrementally validated from l3 to l4
	 */
	uint8_t l3hlen = 0; /* IP header length */
	uint16_t l3tlen = 0; /* total length of IP packet */
	uint8_t l4hlen = 0; /* TCP/UDP header length */
	uint16_t ulen = 0; /* user data length */

	int error = 0;

	/* must be 16-bytes aligned due to use of sk_copy* below */
	_CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0);

	/*
	 * __flow address layouts must mirror the wire headers so the
	 * unrolled copies below can move src+dst in a single operation.
	 */
	_CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8);
	_CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv4_addrs, _src)) ==
	    (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src)));

	_CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32);
	_CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv6_addrs, _src)) ==
	    (offsetof(struct ip6_hdr, ip6_dst) -
	    offsetof(struct ip6_hdr, ip6_src)));

	/* __flow_l4_tcp must mirror tcphdr for the first 16-bytes */
	_CASSERT(sizeof(struct __flow_l4_tcp) == 16);
	_CASSERT((offsetof(struct __flow_l4_tcp, _dst) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_dport) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _seq) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_seq) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _ack) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_ack) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _flags) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_flags) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _win) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_win) -
	    offsetof(struct tcphdr, th_sport)));

	/*
	 * Ensure the same offsets are used for TCP and UDP, so the
	 * CL_SKIP_L4() macro and the port copies below can treat the
	 * flow_tcp_* and flow_udp_* fields interchangeably.
	 */
	_CASSERT(sizeof(struct __flow_l4_udp) == 8);
	_CASSERT(offsetof(struct __flow, flow_tcp_src) ==
	    offsetof(struct __flow, flow_udp_src));
	_CASSERT(offsetof(struct __flow, flow_tcp_dst) ==
	    offsetof(struct __flow, flow_udp_dst));


	/* parsing starts from l3, count SDU length after l2 header */
	ASSERT(pkt->pkt_l2_len <= pkt->pkt_length);
	pkt_len = pkt->pkt_length - pkt->pkt_l2_len;

	/*
	 * we restrict the data length available for classification to the
	 * portion of L3 datagram available in the first buflet.
	 */
	/*
	 * compat netif sets the packet length and buflet data length
	 * metadata to the original length of the packet although the
	 * actual buffer is limited to NETIF_COMPAT_BUF_SIZE (128 bytes).
	 */
	uint8_t *pkt_buf, *l3_hdr;
	uint16_t bdlen, bdlim, bdoff;

	/*
	 * NOTE(review): presumably yields the first buflet's base address
	 * plus its data length/limit/offset — confirm against the macro
	 * definition in the metadata headers.
	 */
	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	cls_len = bdlim - bdoff;
	cls_len -= pkt->pkt_l2_len;
	/* never classify beyond what the packet actually contains */
	cls_len = (uint16_t)MIN(cls_len, pkt_len);
	VERIFY(pkt_len >= cls_len);

	/* takes care of ip6 assignment too */
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
	iph = (volatile struct ip *)(void *)l3_hdr;

	VERIFY(af != AF_UNSPEC);

	/*
	 * Reset the version field up front so a CL_SKIP_ON() bail-out
	 * cannot leave a stale value from a previous classification.
	 */
	pkt->pkt_flow_ip_ver = 0;

	/*
	 * This code is in the hot data path, so we try to be as efficient
	 * as possible, and hence the use of unrolled loads/stores.
	 */

	/***************** L3 header (IP/IPv6) *****************/
	switch (af) {
	case AF_INET:
		CL_SKIP_ON(cls_len < sizeof(struct ip));
		/* ip_hl is in 32-bit words */
		l3hlen = (uint8_t)(iph->ip_hl << 2);
		CL_SKIP_ON(l3hlen < sizeof(struct ip));
		CL_SKIP_ON(cls_len < l3hlen);

		/* don't allow outgoing channel-based packet with option(s) */
		CL_SKIP_ON(!input && l3hlen != sizeof(struct ip));

		l3tlen = ntohs(iph->ip_len);

		CL_SKIP_ON(l3tlen < l3hlen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON(iph->ip_v != IPVERSION);

		/*
		 * Copy ip_src+ip_dst (8 bytes) into the flow record using
		 * the widest access the source alignment allows; bcopy is
		 * the byte-wise fallback for unaligned headers.
		 */
		if (__probable(IS_P2ALIGNED(&iph->ip_src, 8))) {
			sk_copy64_8(__DECONST(uint64_t *, &iph->ip_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else if (IS_P2ALIGNED(&iph->ip_src, 4)) {
			sk_copy32_8(__DECONST(uint32_t *, &iph->ip_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else {
			bcopy(__DECONST(void *, &iph->ip_src),
			    (void *)&pkt->pkt_flow_ipv4_addrs,
			    sizeof(struct __flow_l3_ipv4_addrs));
		}

		pkt->pkt_flow_ip_ver = IPVERSION;
		pkt->pkt_flow_ip_proto = iph->ip_p;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iph;

		/*
		 * Masking out DF and RF leaves MF plus the fragment
		 * offset; any of those bits set means this is a fragment.
		 */
		if (__improbable(ntohs(iph->ip_off) & ~(IP_DF | IP_RF))) {
			pkt->pkt_flow_ip_is_frag = TRUE;
			pkt->pkt_flow_ip_frag_id = iph->ip_id;
			/* we only parse l4 in the 1st frag */
			if ((ntohs(iph->ip_off) & IP_OFFMASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
		}
		break;

	case AF_INET6:
		/* IPv6 has a fixed-size base header (no options field) */
		l3hlen = sizeof(struct ip6_hdr);
		CL_SKIP_ON(cls_len < l3hlen);

		/* ip6_plen excludes the fixed header, so add it back */
		l3tlen = l3hlen + ntohs(ip6->ip6_plen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION);

		/* copy ip6_src+ip6_dst (32 bytes), widest access possible */
		if (__probable(IS_P2ALIGNED(&ip6->ip6_src, 8))) {
			sk_copy64_32(__DECONST(uint64_t *, &ip6->ip6_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else if (IS_P2ALIGNED(&ip6->ip6_src, 4)) {
			sk_copy32_32(__DECONST(uint32_t *, &ip6->ip6_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else {
			bcopy(__DECONST(void *, &ip6->ip6_src),
			    (void *)&pkt->pkt_flow_ipv6_addrs,
			    sizeof(struct __flow_l3_ipv6_addrs));
		}

		pkt->pkt_flow_ip_ver = IPV6_VERSION;
		pkt->pkt_flow_ip_proto = ip6->ip6_nxt;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)ip6;

		/* only parse the next immediate extension header for frags */
		if (__improbable(ip6->ip6_nxt == IPPROTO_FRAGMENT)) {
			volatile struct ip6_frag *ip6f;
			/* frag header immediately follows the base header */
			ip6f = (volatile struct ip6_frag *)(ip6 + 1);
			CL_SKIP_ON(cls_len < l3hlen + sizeof(struct ip6_frag));
			pkt->pkt_flow_ip_is_frag = 1;
			pkt->pkt_flow_ip_frag_id = ip6f->ip6f_ident;
			/* the real L4 protocol lives in the frag header */
			pkt->pkt_flow_ip_proto = ip6f->ip6f_nxt;
			l3hlen += sizeof(struct ip6_frag);
			CL_SKIP_ON(l3tlen < l3hlen);
			/* we only parse l4 in the 1st frag */
			if ((ip6f->ip6f_offlg & IP6F_OFF_MASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
			/*
			 * process atomic frag as non-frag: offset 0 and no
			 * MF bit set (only reserved bits remain).
			 */
			if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
				pkt->pkt_flow_ip_is_frag = 0;
			}
		}
		break;

	default:
		error = ENOTSUP;
		goto done;
	}

	pkt->pkt_flow_ip_hlen = l3hlen;
	/* L3-only classification is still a success for other protocols */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_TCP &&
	    pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		error = 0;
		goto done;
	}

	/**************** L4 header (TCP/UDP) *****************/

	/* this takes care of UDP header as well (see l4 union var) */
	tcph = __DECONST(volatile struct tcphdr *,
	    (volatile uint8_t *)iph + l3hlen);
	ulen = (l3tlen - l3hlen);
	if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*tcph)) ||
		    (ulen < sizeof(*tcph)));
		/* th_off is in 32-bit words */
		l4hlen = (uint8_t)(tcph->th_off << 2);
		CL_SKIP_ON(l4hlen < sizeof(*tcph));
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_tcp_hlen = l4hlen;
		pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcph;
	} else {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*udph)) ||
		    (ulen < sizeof(*udph)));
		l4hlen = sizeof(*udph);
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_udp_hlen = l4hlen;
		pkt->pkt_flow_udp_hdr = (mach_vm_address_t)udph;
	}

	if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		ulen -= l4hlen;
		pkt->pkt_flow_ulen = ulen;
	} else {
		/*
		 * We can't determine user data length for fragment until
		 * it is reassembled.
		 */
		pkt->pkt_flow_ulen = 0;
	}

	/*
	 * Copy ports (and for TCP: seq/ack/flags/win, per the _CASSERTs
	 * above) into the flow record; alignment-aware as with L3.
	 */
	if (__probable(IS_P2ALIGNED(&tcph->th_sport, 4))) {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			sk_copy32_16(__DECONST(uint32_t *, &tcph->th_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_tcp_src);
		} else {
			sk_copy32_8(__DECONST(uint32_t *, &udph->uh_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_udp_src);
		}
	} else {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			bcopy(__DECONST(void *, &tcph->th_sport),
			    (void *)&pkt->pkt_flow_tcp,
			    sizeof(struct __flow_l4_tcp));
		} else {
			bcopy(__DECONST(void *, &udph->uh_sport),
			    (void *)&pkt->pkt_flow_udp,
			    sizeof(struct __flow_l4_udp));
		}
	}

	if (!input && pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
	    pkt->pkt_flow_ulen != 0) {
		/*
		 * Following the logic in tcp_output(), we mark
		 * this if the payload is non-zero; note that
		 * the pkt_flow_tcp_seq is in network byte order.
		 */
		pkt->pkt_pflags |= PKT_F_START_SEQ;
	}
done:
	if (__probable(error == 0)) {
		SK_DF(SK_VERB_FLOW_CLASSIFY, "pkt_length %u l3_ip_len %u "
		    "l3_ip_ver 0x%x l3_proto %u l4_sport %u l4_dport %u",
		    pkt->pkt_length, l3tlen, pkt->pkt_flow_ip_ver,
		    pkt->pkt_flow_ip_proto, ntohs(pkt->pkt_flow_tcp_src),
		    ntohs(pkt->pkt_flow_tcp_dst));
		/* on output, trim metadata length if not same as IP length */
		if (!input) {
			if (__improbable(pkt->pkt_length != l3tlen)) {
				SK_ERR("packet is too long (%u), trimming to "
				    "IP length (%d)", pkt->pkt_length, l3tlen);
				METADATA_SET_LEN(pkt, l3tlen, bdoff);
			}
			if (__improbable(pkt->pkt_length > mtu)) {
				SK_ERR("dropped; length (%u) exceeds MTU (%d)",
				    pkt->pkt_length, mtu);
				SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len,
				    128, NULL, 0));
				error = EMSGSIZE;
				goto fail;
			}
		}
		/*
		 * Mark QUM_F_FLOW_CLASSIFIED on the packet to indicate
		 * that the __flow structure has valid info now.
		 */
		pkt->pkt_qum_qflags |= QUM_F_FLOW_CLASSIFIED;
		return 0;
	}

fail:
	/* classification failed: scrub any partially-populated flow info */
	ASSERT(error != 0 && !(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED));
	KPKT_CLEAR_FLOW_ALL(pkt->pkt_flow);

	return error;
}
406