/*
 * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <machine/endian.h>
#include <net/necp.h>

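/*
 * Tunable: when non-zero, a packet's transmit timestamp is carried
 * over to the destination mbuf as an AQM m_tag (see the NR_TX cases
 * of pkt_copy_to_mbuf() and pkt_copy_multi_buflet_to_mbuf()).
 */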
uint32_t copy_pkt_tx_time = 1;
#if (DEVELOPMENT || DEBUG)
SYSCTL_NODE(_kern_skywalk, OID_AUTO, packet,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk packet");
int pkt_trailers = 0; /* for testing trailing bytes */
SYSCTL_INT(_kern_skywalk_packet, OID_AUTO, trailers,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_trailers, 0, "");

SYSCTL_UINT(_kern_skywalk_packet, OID_AUTO, copy_pkt_tx_time,
    CTLFLAG_RW | CTLFLAG_LOCKED, &copy_pkt_tx_time, 0,
    "copy tx time from pkt to mbuf");
#endif /* !DEVELOPMENT && !DEBUG */


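/*
 * Copy len bytes from src to dst.  When both buffers are 8-byte
 * aligned, wide sk_copy64_* variants are used, with fixed-size fast
 * paths for standard IPv4 (20-byte) and IPv6 (40-byte) headers;
 * otherwise this falls back to bcopy().
 */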
__attribute__((always_inline))
static inline void
_pkt_copy(void *__sized_by(len)src, void *__sized_by(len)dst, size_t len)
{
	if (__probable(IS_P2ALIGNED(src, 8) && IS_P2ALIGNED(dst, 8))) {
		switch (len) {
		case 20:        /* standard IPv4 header */
			sk_copy64_20(src, dst);
			return;

		case 40:        /* IPv6 header */
			sk_copy64_40(src, dst);
			return;

		default:
			if (IS_P2ALIGNED(len, 64)) {
				sk_copy64_64x(src, dst, len);
				return;
			} else if (IS_P2ALIGNED(len, 32)) {
				sk_copy64_32x(src, dst, len);
				return;
			} else if (IS_P2ALIGNED(len, 8)) {
				sk_copy64_8x(src, dst, len);
				return;
			} else if (IS_P2ALIGNED(len, 4)) {
				sk_copy64_4x(src, dst, len);
				return;
			}
			break;
		}
	}
	bcopy(src, dst, len);
}

/*
 * This routine is used for copying data across two kernel packets.
 * It can also optionally compute a 16-bit partial inet checksum as
 * the data is copied.
 * This routine is used by the flowswitch when copying a packet from
 * the vp adapter pool to a packet in the native netif pool, and
 * vice versa.
 *
 * start/stuff is relative to soff, within [0, len], such that
 * [ 0 ... soff ... soff + start/stuff ... soff + len ... ]
 */
void
pkt_copy_from_pkt(const enum txrx t, kern_packet_t dph, const uint16_t doff,
    kern_packet_t sph, const uint16_t soff, const uint32_t len,
    const boolean_t copysum, const uint16_t start, const uint16_t stuff,
    const boolean_t invert)
{
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	uint32_t partial;
	uint16_t csum = 0;
	uint8_t *sbaddr, *dbaddr;
	boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet */
	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
	ASSERT(sbaddr != NULL);
	sbaddr += soff;
	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
	ASSERT(dbaddr != NULL);
	dbaddr += doff;
	VERIFY((doff + len) <= PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));

	switch (t) {
	case NR_RX:
		dpkt->pkt_csum_flags = 0;
		if (__probable(do_sum)) {
			/*
			 * Use _pkt_copy() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (__probable(start != 0)) {
				_pkt_copy(sbaddr, dbaddr, start);
			}
			partial = __packet_copy_and_sum((sbaddr + start),
			    (dbaddr + start), (len - start), 0);
			csum = __packet_fold_sum(partial);

			__packet_set_inet_checksum(dph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
		} else {
			_pkt_copy(sbaddr, dbaddr, len);
			dpkt->pkt_csum_rx_start_off = spkt->pkt_csum_rx_start_off;
			dpkt->pkt_csum_rx_value = spkt->pkt_csum_rx_value;
			dpkt->pkt_csum_flags |= spkt->pkt_csum_flags & PACKET_CSUM_RX_FLAGS;
		}

		SK_DF(SK_VERB_COPY | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY | SK_VERB_RX,
		    " pkt 0x%llx doff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(dpkt), doff, dpkt->pkt_csum_flags,
		    (uint32_t)dpkt->pkt_csum_rx_start_off,
		    (uint32_t)dpkt->pkt_csum_rx_value);
		break;

	case NR_TX:
		if (copysum) {
			/*
			 * Use _pkt_copy() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (__probable(start != 0)) {
				_pkt_copy(sbaddr, dbaddr, start);
			}
			partial = __packet_copy_and_sum((sbaddr + start),
			    (dbaddr + start), (len - start), 0);
			csum = __packet_fold_sum_final(partial);

			/* RFC1122 4.1.3.4: Invert 0 to -0 for UDP */
			if (csum == 0 && invert) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(dbaddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(dbaddr + stuff) = csum;
			} else {
				bcopy((void *)&csum, dbaddr + stuff,
				    sizeof(csum));
			}
		} else {
			_pkt_copy(sbaddr, dbaddr, len);
		}
		dpkt->pkt_csum_flags = spkt->pkt_csum_flags &
		    (PACKET_CSUM_TSO_FLAGS | PACKET_TX_CSUM_OFFLOAD_FLAGS);
		dpkt->pkt_csum_tx_start_off = 0;
		dpkt->pkt_csum_tx_stuff_off = 0;

		SK_DF(SK_VERB_COPY | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u, flags %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start, dpkt->pkt_csum_flags);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
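	/* reflect the copied bytes in the destination packet's length */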
	METADATA_ADJUST_LEN(dpkt, len, doff);

	SK_DF(SK_VERB_COPY | SK_VERB_DUMP, "%s(%d) %s %s",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    (t == NR_RX) ? "RX" : "TX",
	    sk_dump("buf", dbaddr, len, 128, NULL, 0));
}

/*
 * NOTE: soff is the offset within the packet.
 * The accumulated partial sum (32-bit) is returned to the caller;
 * the caller is responsible for further reducing it to 16 bits if
 * needed, as well as for performing the final 1's complement on it.
 */
static inline uint32_t
_pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbaddr,
    uint32_t len, boolean_t do_csum, uint32_t initial_sum, boolean_t *odd_start)
{
	uint8_t odd = 0;
	uint8_t *sbaddr = NULL;
	uint32_t sum = initial_sum, partial;
	uint32_t len0 = len;
	boolean_t needs_swap, started_on_odd = FALSE;
	uint16_t sbcnt, off0 = soff;
	uint32_t clen, sboff, sblen;
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	kern_buflet_t sbuf = NULL, sbufp = NULL;

	sbcnt = __packet_get_buflet_count(sph);

	if (odd_start) {
		started_on_odd = *odd_start;
	}

	/* fastpath (copy+sum, single buflet, even aligned, even length) */
	if (do_csum && sbcnt == 1 && len != 0) {
		PKT_GET_NEXT_BUFLET(spkt, 1, sbufp, sbuf);
		ASSERT(sbuf != NULL);
		sboff = __buflet_get_data_offset(sbuf);
		sblen = __buflet_get_data_length(sbuf);
		ASSERT(sboff <= soff);
		ASSERT(soff < sboff + sblen);
		sblen -= (soff - sboff);
		sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;

		clen = (uint16_t)MIN(len, sblen);

		if (((uintptr_t)sbaddr & 1) == 0 && clen && (clen & 1) == 0) {
			sum = __packet_copy_and_sum(sbaddr, dbaddr, clen, sum);
			return __packet_fold_sum(sum);
		}

		sbaddr = NULL;
		sbuf = sbufp = NULL;
	}

	while (len != 0) {
		PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf);
		if (__improbable(sbuf == NULL)) {
			panic("%s: bad packet, 0x%llx [off %d, len %d]",
			    __func__, SK_KVA(spkt), off0, len0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		sbufp = sbuf;
		sboff = __buflet_get_data_offset(sbuf);
		sblen = __buflet_get_data_length(sbuf);
		ASSERT((sboff <= soff) && (soff < sboff + sblen));
		sblen -= (soff - sboff);
		sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;
		soff = 0;
		clen = (uint16_t)MIN(len, sblen);
		if (__probable(do_csum)) {
			partial = 0;
			if (__improbable((uintptr_t)sbaddr & 1)) {
				/* Align on word boundary */
				started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
				partial = (uint8_t)*sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial = (uint8_t)*sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				/*
				 * -fbounds-safety: *dbaddr++ = *sbaddr++ fails
				 * to compile. But the following works. Also,
				 * grouping the dbaddr and len updates led to
				 * higher throughput than doing
				 * dbaddr++; sbaddr++; len -= 1; in that order.
				 */
				*dbaddr = *sbaddr;
				dbaddr++;
				sblen -= 1;
				clen -= 1;
				len -= 1;
				sbaddr++;
			}
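			/*
			 * If an odd number of bytes has been consumed so
			 * far, every 16-bit word summed below is offset by
			 * one byte relative to the running total, so the
			 * partial sum must be byte-swapped before folding
			 * it into sum.
			 */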
			needs_swap = started_on_odd;

			odd = clen & 1u;
			clen -= odd;

			if (clen != 0) {
				partial = __packet_copy_and_sum(sbaddr, dbaddr,
				    clen, partial);
			}

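			/*
			 * Fold partial early if its high-order bits are
			 * set, so that the potential byte swap and the
			 * additions below cannot overflow 32 bits.
			 */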
			if (__improbable(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		} else {
			_pkt_copy(sbaddr, dbaddr, clen);
		}

		dbaddr += clen;

		/*
		 * -fbounds-safety: the following 3 lines were moved up from
		 * after the if-block. None of these are modified in the
		 * if-block, so moving these up here shouldn't change the
		 * behavior. Also, updating len before updating sbaddr led to
		 * faster throughput than doing: dbaddr += clen; sbaddr += clen;
		 * len -= clen + odd;
		 */
		sblen -= clen + odd;
		len -= clen + odd;
		ASSERT(sblen == 0 || len == 0);

		sbaddr += clen;

		if (__probable(do_csum)) {
			if (odd != 0) {
#if BYTE_ORDER == LITTLE_ENDIAN
				partial += (uint8_t)*sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial += (uint8_t)*sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				*dbaddr++ = *sbaddr++;
				started_on_odd = !started_on_odd;
			}

			if (needs_swap) {
				partial = (partial << 8) + (partial >> 24);
			}
			sum += (partial >> 16) + (partial & 0xffff);
			/*
			 * Reduce sum to allow potential byte swap
			 * in the next iteration without carry.
			 */
			sum = (sum >> 16) + (sum & 0xffff);
		}
	}

	if (odd_start) {
		*odd_start = started_on_odd;
	}

	if (__probable(do_csum)) {
		/* Final fold (reduce 32-bit to 16-bit) */
		sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
		sum = (sum >> 16) + (sum & 0xffff);
	}
	return sum;
}

/*
 * NOTE: The caller of this function is responsible for adjusting the
 * length and offset of the first buflet of the destination packet if
 * (doff != 0), i.e. when additional data is being prepended to the
 * packet. It should also finalize the packet.
 * To simplify and optimize the routine, we also assume that soff and
 * doff lie within the first buffer, which is true for the current use
 * cases, where doff is the offset of the checksum field in the TCP/IP
 * header and soff is the L3 offset.
 * The accumulated partial sum (32-bit) is returned to the caller in
 * csum_partial; the caller is responsible for further reducing it to
 * 16 bits if needed, as well as for performing the final 1's
 * complement on it.
 */
static inline boolean_t
_pkt_copypkt_sum(kern_packet_t sph, uint16_t soff, kern_packet_t dph,
    uint16_t doff, uint32_t len, uint32_t *csum_partial, boolean_t do_csum)
{
	uint8_t odd = 0;
	uint32_t sum = 0, partial;
	boolean_t needs_swap, started_on_odd = FALSE;
	uint8_t *sbaddr = NULL, *dbaddr = NULL;
	uint16_t sbcnt, dbcnt;
	uint32_t clen, dlen0, sboff, sblen, dlim;
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	kern_buflet_t sbuf = NULL, sbufp = NULL, dbuf = NULL, dbufp = NULL;

	ASSERT(csum_partial != NULL || !do_csum);
	sbcnt = __packet_get_buflet_count(sph);
	dbcnt = __packet_get_buflet_count(dph);

	while (len != 0) {
		ASSERT(sbaddr == NULL || dbaddr == NULL);
		if (sbaddr == NULL) {
			PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf);
			if (__improbable(sbuf == NULL)) {
				break;
			}
			sbufp = sbuf;
			sblen = __buflet_get_data_length(sbuf);
			sboff = __buflet_get_data_offset(sbuf);
			ASSERT(soff >= sboff);
			ASSERT(sboff + sblen > soff);
			sblen -= (soff - sboff);
			sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;
			soff = 0;
		}

		if (dbaddr == NULL) {
			if (dbufp != NULL) {
				__buflet_set_data_length(dbufp, dlen0);
			}

			PKT_GET_NEXT_BUFLET(dpkt, dbcnt, dbufp, dbuf);
			if (__improbable(dbuf == NULL)) {
				break;
			}
			dbufp = dbuf;
			dlim = __buflet_get_data_limit(dbuf);
			ASSERT(dlim > doff);
			dlim -= doff;
			if (doff != 0) {
				VERIFY(__buflet_set_data_offset(dbuf, doff) == 0);
			}
			dbaddr = (uint8_t *)__buflet_get_data_address(dbuf) + doff;
			dlen0 = dlim;
			doff = 0;
		}

		clen = MIN(len, sblen);
		clen = MIN(clen, dlim);

		if (__probable(do_csum)) {
			partial = 0;
			if (__improbable((uintptr_t)sbaddr & 1)) {
				/* Align on word boundary */
				started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
				partial = (uint8_t)*sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial = (uint8_t)*sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				*dbaddr++ = *sbaddr++;
				clen -= 1;
				dlim -= 1;
				len -= 1;
			}
			needs_swap = started_on_odd;

			odd = clen & 1u;
			clen -= odd;

			if (clen != 0) {
				partial = __packet_copy_and_sum(sbaddr, dbaddr,
				    clen, partial);
			}

			if (__improbable(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		} else {
			_pkt_copy(sbaddr, dbaddr, clen);
		}
		sbaddr += clen;
		dbaddr += clen;

		if (__probable(do_csum)) {
			if (odd != 0) {
#if BYTE_ORDER == LITTLE_ENDIAN
				partial += (uint8_t)*sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial += (uint8_t)*sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				*dbaddr++ = *sbaddr++;
				started_on_odd = !started_on_odd;
			}

			if (needs_swap) {
				partial = (partial << 8) + (partial >> 24);
			}
			sum += (partial >> 16) + (partial & 0xffff);
			/*
			 * Reduce sum to allow potential byte swap
			 * in the next iteration without carry.
			 */
			sum = (sum >> 16) + (sum & 0xffff);
		}

		sblen -= clen + odd;
		dlim -= clen + odd;
		len -= clen + odd;

		if (sblen == 0) {
			sbaddr = NULL;
		}

		if (dlim == 0) {
			dbaddr = NULL;
		}
	}

	if (__probable(dbuf != NULL)) {
		__buflet_set_data_length(dbuf, (dlen0 - dlim));
	}
	if (__probable(do_csum)) {
		/* Final fold (reduce 32-bit to 16-bit) */
		sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
		sum = (sum >> 16) + (sum & 0xffff);
		*csum_partial = (uint32_t)sum;
	}
	return len == 0;
}

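/*
 * Compute the folded 16-bit inet checksum over len bytes of packet
 * data starting at offset soff, without copying; multi-buflet packets
 * and odd alignment are handled in the slowpath.
 */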
uint32_t
pkt_sum(kern_packet_t sph, uint16_t soff, uint16_t len)
{
	uint8_t odd = 0;
	uint32_t sum = 0, partial;
	boolean_t needs_swap, started_on_odd = FALSE;
	uint8_t *sbaddr = NULL;
	uint16_t sbcnt;
	uint32_t clen, sblen, sboff;
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	kern_buflet_t sbuf = NULL, sbufp = NULL;

	sbcnt = __packet_get_buflet_count(sph);

	/* fastpath (single buflet, even aligned, even length) */
	if (sbcnt == 1 && len != 0) {
		PKT_GET_NEXT_BUFLET(spkt, 1, sbufp, sbuf);
		ASSERT(sbuf != NULL);
		sblen = __buflet_get_data_length(sbuf);
		sboff = __buflet_get_data_offset(sbuf);
		ASSERT(soff >= sboff);
		ASSERT(sboff + sblen > soff);
		sblen -= (soff - sboff);
		sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;

		clen = MIN(len, sblen);

		if (((uintptr_t)sbaddr & 1) == 0 && clen && (clen & 1) == 0) {
			sum = __packet_cksum(sbaddr, clen, 0);
			return __packet_fold_sum(sum);
		}

		sbaddr = NULL;
		sbuf = sbufp = NULL;
	}

	/* slowpath */
	while (len != 0) {
		ASSERT(sbaddr == NULL);
		if (sbaddr == NULL) {
			PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf);
			if (__improbable(sbuf == NULL)) {
				break;
			}
			sbufp = sbuf;
			sblen = __buflet_get_data_length(sbuf);
			sboff = __buflet_get_data_offset(sbuf);
			ASSERT(soff >= sboff);
			ASSERT(sboff + sblen > soff);
			sblen -= (soff - sboff);
			sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;
			soff = 0;
		}

		clen = MIN(len, sblen);

		partial = 0;
		if (__improbable((uintptr_t)sbaddr & 1)) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = (uint8_t)*sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial = (uint8_t)*sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			/* consume the odd leading byte */
			sbaddr++;
			sblen -= 1;
			clen -= 1;
			len -= 1;
		}
		needs_swap = started_on_odd;

		odd = clen & 1u;
		clen -= odd;

		if (clen != 0) {
			partial = __packet_cksum(sbaddr,
			    clen, partial);
		}

		if (__improbable(partial & 0xc0000000)) {
			if (needs_swap) {
				partial = (partial << 8) +
				    (partial >> 24);
			}
			sum += (partial >> 16);
			sum += (partial & 0xffff);
			partial = 0;
		}
		sbaddr += clen;

		if (odd != 0) {
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += (uint8_t)*sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial += (uint8_t)*sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			started_on_odd = !started_on_odd;
		}

		if (needs_swap) {
			partial = (partial << 8) + (partial >> 24);
		}
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);

		sblen -= clen + odd;
		len -= clen + odd;

		if (sblen == 0) {
			sbaddr = NULL;
		}
	}

	/* Final fold (reduce 32-bit to 16-bit) */
	sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return (uint32_t)sum;
}


/*
 * This is a multi-buflet variant of pkt_copy_from_pkt().
 *
 * start/stuff is relative to soff, within [0, len], such that
 * [ 0 ... soff ... soff + start/stuff ... soff + len ... ]
 */
void
pkt_copy_multi_buflet_from_pkt(const enum txrx t, kern_packet_t dph,
    const uint16_t doff, kern_packet_t sph, const uint16_t soff,
    const uint32_t len, const boolean_t copysum, const uint16_t start,
    const uint16_t stuff, const boolean_t invert)
{
	boolean_t rc;
	uint32_t partial;
	uint16_t csum = 0;
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	VERIFY((doff + len) <= (PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp) *
	    __packet_get_buflet_count(dph)));

	switch (t) {
	case NR_RX:
		dpkt->pkt_csum_flags = 0;
		if (__probable(do_sum)) {
			/*
			 * copy the portion up to the point where we need to
			 * start the checksum, and copy the remainder,
			 * checksumming as we go.
			 */
			if (__probable(start != 0)) {
				rc = _pkt_copypkt_sum(sph, soff, dph, doff,
				    start, NULL, FALSE);
				ASSERT(rc);
			}
			_pkt_copypkt_sum(sph, (soff + start), dph,
			    (doff + start), (len - start), &partial, TRUE);
			csum = __packet_fold_sum(partial);
			__packet_set_inet_checksum(dph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
			METADATA_ADJUST_LEN(dpkt, start, doff);
		} else {
			rc = _pkt_copypkt_sum(sph, soff, dph, doff, len, NULL,
			    FALSE);
			ASSERT(rc);
			dpkt->pkt_csum_rx_start_off = spkt->pkt_csum_rx_start_off;
			dpkt->pkt_csum_rx_value = spkt->pkt_csum_rx_value;
			dpkt->pkt_csum_flags |= spkt->pkt_csum_flags & PACKET_CSUM_RX_FLAGS;
		}
		break;

	case NR_TX:
		if (copysum) {
			uint8_t *baddr;
			/*
			 * copy the portion up to the point where we need to
			 * start the checksum, and copy the remainder,
			 * checksumming as we go.
			 */
			if (__probable(start != 0)) {
				rc = _pkt_copypkt_sum(sph, soff, dph, doff,
				    start, NULL, FALSE);
				ASSERT(rc);
			}
			rc = _pkt_copypkt_sum(sph, (soff + start), dph,
			    (doff + start), (len - start), &partial, TRUE);
			ASSERT(rc);
			csum = __packet_fold_sum_final(partial);

			/* RFC1122 4.1.3.4: Invert 0 to -0 for UDP */
			if (csum == 0 && invert) {
				csum = 0xffff;
			}

			/*
			 * Insert checksum into packet.
			 * Here we assume that the checksum will be in the
			 * first buffer.
			 */
			ASSERT((stuff + doff + sizeof(csum)) <=
			    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
			ASSERT(stuff <= (len - sizeof(csum)));

			/* get first buflet buffer address from packet */
			MD_BUFLET_ADDR_ABS(dpkt, baddr);
			ASSERT(baddr != NULL);
			baddr += doff;
			if (IS_P2ALIGNED(baddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(baddr + stuff) = csum;
			} else {
				bcopy((void *)&csum, baddr + stuff,
				    sizeof(csum));
			}
			METADATA_ADJUST_LEN(dpkt, start, doff);
		} else {
			rc = _pkt_copypkt_sum(sph, soff, dph, doff, len, NULL,
			    FALSE);
			ASSERT(rc);
		}
		dpkt->pkt_csum_flags = spkt->pkt_csum_flags &
		    (PACKET_CSUM_TSO_FLAGS | PACKET_TX_CSUM_OFFLOAD_FLAGS);
		dpkt->pkt_csum_tx_start_off = 0;
		dpkt->pkt_csum_tx_stuff_off = 0;

		SK_DF(SK_VERB_COPY | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u, flags %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start, dpkt->pkt_csum_flags);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

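/* translate mbuf CSUM_* offload flags to their PACKET_CSUM_* equivalents */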
static inline uint32_t
_convert_mbuf_csum_flags(uint32_t mbuf_flags)
{
	uint32_t pkt_flags = 0;

	if (mbuf_flags & CSUM_TCP) {
		pkt_flags |= PACKET_CSUM_TCP;
	}
	if (mbuf_flags & CSUM_TCPIPV6) {
		pkt_flags |= PACKET_CSUM_TCPIPV6;
	}
	if (mbuf_flags & CSUM_UDP) {
		pkt_flags |= PACKET_CSUM_UDP;
	}
	if (mbuf_flags & CSUM_UDPIPV6) {
		pkt_flags |= PACKET_CSUM_UDPIPV6;
	}
	if (mbuf_flags & CSUM_IP) {
		pkt_flags |= PACKET_CSUM_IP;
	}
	if (mbuf_flags & CSUM_ZERO_INVERT) {
		pkt_flags |= PACKET_CSUM_ZERO_INVERT;
	}

	return pkt_flags;
}

/*
 * This routine is used for copying an mbuf which originated in the host
 * stack destined to a native skywalk interface (NR_TX), as well as for
 * mbufs originating on compat network interfaces (NR_RX).
 *
 * start/stuff is relative to moff, within [0, len], such that
 * [ 0 ... moff ... moff + start/stuff ... moff + len ... ]
 */
void
pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff,
    struct mbuf *m, const uint16_t moff, const uint32_t len,
    const boolean_t copysum, const uint16_t start)
{
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	struct m_tag *ts_tag = NULL;
	uint32_t partial;
	uint16_t csum = 0;
	uint8_t *baddr;

	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet */
	MD_BUFLET_ADDR_ABS(pkt, baddr);
	ASSERT(baddr != NULL);
	baddr += poff;
	VERIFY((poff + len) <= PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp));

	switch (t) {
	case NR_RX:
		pkt->pkt_csum_flags = m->m_pkthdr.csum_flags;
		pkt->pkt_csum_rx_start_off = 0;
		pkt->pkt_csum_rx_value = m->m_pkthdr.csum_rx_val;
		pkt->pkt_svc_class = m_get_service_class(m);
		if (__probable(((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS)
		    != CSUM_RX_FULL_FLAGS) && copysum)) {
			/*
			 * Use m_copydata() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (start != 0) {
				m_copydata(m, moff, start, baddr);
			}
			partial = m_copydata_sum(m, start, (len - start),
			    (baddr + start), 0, NULL);
			csum = __packet_fold_sum(partial);

			__packet_set_inet_checksum(ph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
		} else {
			m_copydata(m, moff, len, baddr);
		}
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " mbuf 0x%llx csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(m), m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_rx_start_off,
		    (uint32_t)pkt->pkt_csum_rx_value);
		break;

	case NR_TX:
		if (copysum) {
			uint16_t stuff = m->m_pkthdr.csum_tx_stuff;
			/*
			 * Use m_copydata() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (start != 0) {
				m_copydata(m, moff, start, baddr);
			}
			partial = m_copydata_sum(m, start, (len - start),
			    (baddr + start), 0, NULL);
			csum = __packet_fold_sum_final(partial);

			/*
			 * RFC1122 4.1.3.4: Invert 0 to -0 for UDP;
			 * ideally we'd only test for CSUM_ZERO_INVERT
			 * here, but catch cases where the originator
			 * did not set it for UDP.
			 */
			if (csum == 0 && (m->m_pkthdr.csum_flags &
			    (CSUM_UDP | CSUM_UDPIPV6 | CSUM_ZERO_INVERT))) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(baddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(baddr + stuff) = csum;
			} else {
				bcopy((void *)&csum, baddr + stuff,
				    sizeof(csum));
			}
		} else {
			m_copydata(m, moff, len, baddr);
		}
		pkt->pkt_csum_flags = 0;
		pkt->pkt_csum_tx_start_off = 0;
		pkt->pkt_csum_tx_stuff_off = 0;

		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
			pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV4;
			pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
			ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV6) == 0);
		}
		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) {
			pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV6;
			pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
			ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV4) == 0);
		}
		if (!copysum) {
			pkt->pkt_csum_flags |= _convert_mbuf_csum_flags(m->m_pkthdr.csum_flags);
		}

		/* translate mbuf metadata */
		pkt->pkt_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		pkt->pkt_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
		pkt->pkt_flow_token = m->m_pkthdr.pkt_flowid;
		pkt->pkt_comp_gencnt = m->m_pkthdr.comp_gencnt;
		switch (m->m_pkthdr.pkt_proto) {
		case IPPROTO_QUIC:
			pkt->pkt_flow_ip_proto = IPPROTO_UDP;
			pkt->pkt_transport_protocol = IPPROTO_QUIC;
			break;

		default:
			pkt->pkt_flow_ip_proto = m->m_pkthdr.pkt_proto;
			pkt->pkt_transport_protocol = m->m_pkthdr.pkt_proto;
			break;
		}
		(void) mbuf_get_timestamp(m, &pkt->pkt_timestamp, NULL);
		pkt->pkt_svc_class = m_get_service_class(m);
		pkt->pkt_pflags &= ~PKT_F_COMMON_MASK;
		pkt->pkt_pflags |= (m->m_pkthdr.pkt_flags & PKT_F_COMMON_MASK);
		if ((m->m_pkthdr.pkt_flags & PKTF_START_SEQ) != 0) {
			pkt->pkt_flow_tcp_seq = htonl(m->m_pkthdr.tx_start_seq);
		}
		if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_L4S) != 0) {
			pkt->pkt_pflags |= PKT_F_L4S;
		}
		necp_get_app_uuid_from_packet(m, pkt->pkt_policy_euuid);
		pkt->pkt_policy_id =
		    (uint32_t)necp_get_policy_id_from_packet(m);
		pkt->pkt_skip_policy_id =
		    (uint32_t)necp_get_skip_policy_id_from_packet(m);

		if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) != 0) {
			if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) != 0) {
				__packet_set_tx_completion_data(ph,
				    m->m_pkthdr.drv_tx_compl_arg,
				    m->m_pkthdr.drv_tx_compl_data);
			}
			pkt->pkt_tx_compl_context =
			    m->m_pkthdr.pkt_compl_context;
			pkt->pkt_tx_compl_callbacks =
			    m->m_pkthdr.pkt_compl_callbacks;
			/*
			 * Remove the PKTF_TX_COMPL_TS_REQ flag so that this
			 * mbuf can no longer trigger a completion callback;
			 * the callback will be invoked when the kernel
			 * packet is completed.
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;

			m_add_crumb(m, PKT_CRUMB_SK_PKT_COPY);
		}

		ts_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM);
		if (ts_tag != NULL) {
			__packet_set_tx_timestamp(ph, *(uint64_t *)(ts_tag->m_tag_data));
		}

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    " mbuf 0x%llx csumf/txstart/txstuff 0x%x/%u/%u",
		    SK_KVA(m), m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_tx_start,
		    (uint32_t)m->m_pkthdr.csum_tx_stuff);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	METADATA_ADJUST_LEN(pkt, len, poff);

	if (m->m_flags & M_BCAST) {
		__packet_set_link_broadcast(ph);
	} else if (m->m_flags & M_MCAST) {
		__packet_set_link_multicast(ph);
	}

	SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    (t == NR_RX) ? "RX" : "TX",
	    sk_dump("buf", baddr, len, 128, NULL, 0));
}

/*
 * Like m_copydata_sum(), but works on a destination kernel packet.
 */
static inline uint32_t
m_copypkt_sum(mbuf_t m, int soff, kern_packet_t dph, uint16_t doff,
    uint32_t len, boolean_t do_csum)
{
	boolean_t needs_swap, started_on_odd = FALSE;
	int off0 = soff;
	uint32_t len0 = len;
	struct mbuf *m0 = m;
	uint32_t sum = 0, partial;
	unsigned count0, count, odd, mlen_copied;
	uint8_t *sbaddr = NULL, *dbaddr = NULL;
	uint16_t dbcnt = __packet_get_buflet_count(dph);
	uint32_t dlim, dlen0;
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	kern_buflet_t dbuf = NULL, dbufp = NULL;

	while (soff > 0) {
		if (__improbable(m == NULL)) {
			panic("%s: invalid mbuf chain %p [off %d, len %d]",
			    __func__, m0, off0, len0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		if (soff < m->m_len) {
			break;
		}
		soff -= m->m_len;
		m = m->m_next;
	}

	if (__improbable(m == NULL)) {
		panic("%s: invalid mbuf chain %p [off %d, len %d]",
		    __func__, m0, off0, len0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	sbaddr = mtod(m, uint8_t *) + soff;
	count = m->m_len - soff;
	mlen_copied = 0;

	while (len != 0) {
		ASSERT(sbaddr == NULL || dbaddr == NULL);
		if (sbaddr == NULL) {
			soff = 0;
			m = m->m_next;
			if (__improbable(m == NULL)) {
				panic("%s: invalid mbuf chain %p [off %d, "
				    "len %d]", __func__, m0, off0, len0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			sbaddr = mtod(m, uint8_t *);
			count = m->m_len;
			mlen_copied = 0;
		}

		if (__improbable(count == 0)) {
			sbaddr = NULL;
			continue;
		}

		if (dbaddr == NULL) {
			if (dbufp != NULL) {
				__buflet_set_data_length(dbufp, dlen0);
			}

			PKT_GET_NEXT_BUFLET(dpkt, dbcnt, dbufp, dbuf);
			if (__improbable(dbuf == NULL)) {
				panic("%s: mbuf too large %p [off %d, "
				    "len %d]", __func__, m0, off0, len0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			dbufp = dbuf;
			dlim = __buflet_get_data_limit(dbuf) - doff;
			dbaddr = (uint8_t *)__buflet_get_data_address(dbuf) + doff;
			dlen0 = dlim;
			doff = 0;
		}

		count = MIN(count, (unsigned)len);
		count0 = count = MIN(count, dlim);

		if (!do_csum) {
			_pkt_copy(sbaddr, dbaddr, count);
			sbaddr += count;
			dbaddr += count;
			goto skip_csum;
		}

		partial = 0;
		if ((uintptr_t)sbaddr & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial = *sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			*dbaddr++ = *sbaddr++;
			count -= 1;
		}

		needs_swap = started_on_odd;
		odd = count & 1u;
		count -= odd;

		if (count) {
			partial = __packet_copy_and_sum(sbaddr,
			    dbaddr, count, partial);
			sbaddr += count;
			dbaddr += count;
			if (__improbable(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}

		if (odd) {
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial += *sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			*dbaddr++ = *sbaddr++;
			started_on_odd = !started_on_odd;
		}

		if (needs_swap) {
			partial = (partial << 8) + (partial >> 24);
		}
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);

skip_csum:
		dlim -= count0;
		len -= count0;
		mlen_copied += count0;

		if (dlim == 0) {
			dbaddr = NULL;
		}

		count = m->m_len - soff - mlen_copied;
		if (count == 0) {
			sbaddr = NULL;
		}
	}

	ASSERT(len == 0);
	ASSERT(dbuf != NULL);
	__buflet_set_data_length(dbuf, (dlen0 - dlim));

	if (!do_csum) {
		return 0;
	}

	/* Final fold (reduce 32-bit to 16-bit) */
	sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return sum;
}

/*
 * This is a multi-buflet variant of pkt_copy_from_mbuf().
 *
 * start/stuff is relative to moff, within [0, len], such that
 * [ 0 ... moff ... moff + start/stuff ... moff + len ... ]
 */
void
pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph,
    const uint16_t poff, struct mbuf *m, const uint16_t moff,
    const uint32_t len, const boolean_t copysum, const uint16_t start)
{
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	struct m_tag *ts_tag = NULL;
	uint32_t partial;
	uint16_t csum = 0;
	uint8_t *baddr;

	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet */
	MD_BUFLET_ADDR_ABS(pkt, baddr);
	ASSERT(baddr != NULL);
	baddr += poff;
	VERIFY((poff + len) <= (PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp) *
	    __packet_get_buflet_count(ph)));

	switch (t) {
	case NR_RX:
		pkt->pkt_csum_flags = m->m_pkthdr.csum_flags;
		pkt->pkt_csum_rx_start_off = 0;
		pkt->pkt_csum_rx_value = m->m_pkthdr.csum_rx_val;
		pkt->pkt_svc_class = m_get_service_class(m);
		if (__probable(((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS)
		    != CSUM_RX_FULL_FLAGS) && copysum)) {
			/*
			 * Use m_copydata() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (start != 0) {
				m_copydata(m, moff, start, baddr);
			}
			partial = m_copypkt_sum(m, start, ph, (poff + start),
			    (len - start), TRUE);
			csum = __packet_fold_sum(partial);
			__packet_set_inet_checksum(ph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
			METADATA_ADJUST_LEN(pkt, start, poff);
		} else {
			(void) m_copypkt_sum(m, moff, ph, poff, len, FALSE);
		}
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " mbuf 0x%llx csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(m), m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_rx_start_off,
		    (uint32_t)pkt->pkt_csum_rx_value);
		break;

	case NR_TX:
		if (copysum) {
			uint16_t stuff = m->m_pkthdr.csum_tx_stuff;
			/*
			 * Use m_copydata() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (start != 0) {
				m_copydata(m, moff, start, baddr);
			}
			partial = m_copypkt_sum(m, start, ph, (poff + start),
			    (len - start), TRUE);
			csum = __packet_fold_sum_final(partial);

			/*
			 * RFC1122 4.1.3.4: Invert 0 to -0 for UDP;
			 * ideally we'd only test for CSUM_ZERO_INVERT
			 * here, but catch cases where the originator
			 * did not set it for UDP.
			 */
			if (csum == 0 && (m->m_pkthdr.csum_flags &
			    (CSUM_UDP | CSUM_UDPIPV6 | CSUM_ZERO_INVERT))) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(baddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(baddr + stuff) = csum;
			} else {
				bcopy((void *)&csum, baddr + stuff,
				    sizeof(csum));
			}
			METADATA_ADJUST_LEN(pkt, start, poff);
		} else {
			m_copypkt_sum(m, moff, ph, poff, len, FALSE);
		}
		pkt->pkt_csum_flags = 0;
		pkt->pkt_csum_tx_start_off = 0;
		pkt->pkt_csum_tx_stuff_off = 0;

		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
			pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV4;
			pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
			ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV6) == 0);
		}
		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) {
			pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV6;
			pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
			ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV4) == 0);
		}
		if (!copysum) {
			pkt->pkt_csum_flags |= _convert_mbuf_csum_flags(m->m_pkthdr.csum_flags);
		}

		/* translate mbuf metadata */
		pkt->pkt_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		pkt->pkt_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
		pkt->pkt_flow_token = m->m_pkthdr.pkt_flowid;
		pkt->pkt_comp_gencnt = m->m_pkthdr.comp_gencnt;
		switch (m->m_pkthdr.pkt_proto) {
		case IPPROTO_QUIC:
			pkt->pkt_flow_ip_proto = IPPROTO_UDP;
			pkt->pkt_transport_protocol = IPPROTO_QUIC;
			break;

		default:
			pkt->pkt_flow_ip_proto = m->m_pkthdr.pkt_proto;
			pkt->pkt_transport_protocol = m->m_pkthdr.pkt_proto;
			break;
		}
		(void) mbuf_get_timestamp(m, &pkt->pkt_timestamp, NULL);
		pkt->pkt_svc_class = m_get_service_class(m);
		pkt->pkt_pflags &= ~PKT_F_COMMON_MASK;
		pkt->pkt_pflags |= (m->m_pkthdr.pkt_flags & PKT_F_COMMON_MASK);
		if ((m->m_pkthdr.pkt_flags & PKTF_START_SEQ) != 0) {
			pkt->pkt_flow_tcp_seq = htonl(m->m_pkthdr.tx_start_seq);
		}
		if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_L4S) != 0) {
			pkt->pkt_pflags |= PKT_F_L4S;
		}
		necp_get_app_uuid_from_packet(m, pkt->pkt_policy_euuid);
		pkt->pkt_policy_id =
		    (uint32_t)necp_get_policy_id_from_packet(m);
		pkt->pkt_skip_policy_id =
		    (uint32_t)necp_get_skip_policy_id_from_packet(m);

		if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) != 0) {
			if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) != 0) {
				__packet_set_tx_completion_data(ph,
				    m->m_pkthdr.drv_tx_compl_arg,
				    m->m_pkthdr.drv_tx_compl_data);
			}
			pkt->pkt_tx_compl_context =
			    m->m_pkthdr.pkt_compl_context;
			pkt->pkt_tx_compl_callbacks =
			    m->m_pkthdr.pkt_compl_callbacks;
			/*
			 * Remove the PKTF_TX_COMPL_TS_REQ flag so that this
			 * mbuf can no longer trigger a completion callback;
			 * the callback will be invoked when the kernel
			 * packet is completed.
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;

			m_add_crumb(m, PKT_CRUMB_SK_PKT_COPY);
		}

		ts_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM);
		if (ts_tag != NULL) {
			__packet_set_tx_timestamp(ph, *(uint64_t *)(ts_tag->m_tag_data));
		}

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    " mbuf 0x%llx csumf/txstart/txstuff 0x%x/%u/%u",
		    SK_KVA(m), m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_tx_start,
		    (uint32_t)m->m_pkthdr.csum_tx_stuff);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (m->m_flags & M_BCAST) {
		__packet_set_link_broadcast(ph);
	} else if (m->m_flags & M_MCAST) {
		__packet_set_link_multicast(ph);
	}

	SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    (t == NR_RX) ? "RX" : "TX",
	    sk_dump("buf", baddr, len, 128, NULL, 0));
}

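/* translate PACKET_CSUM_* offload flags back to mbuf CSUM_* equivalents */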
static inline uint32_t
_convert_pkt_csum_flags(uint32_t pkt_flags)
{
	uint32_t mbuf_flags = 0;
	if (pkt_flags & PACKET_CSUM_TCP) {
		mbuf_flags |= CSUM_TCP;
	}
	if (pkt_flags & PACKET_CSUM_TCPIPV6) {
		mbuf_flags |= CSUM_TCPIPV6;
	}
	if (pkt_flags & PACKET_CSUM_UDP) {
		mbuf_flags |= CSUM_UDP;
	}
	if (pkt_flags & PACKET_CSUM_UDPIPV6) {
		mbuf_flags |= CSUM_UDPIPV6;
	}
	if (pkt_flags & PACKET_CSUM_IP) {
		mbuf_flags |= CSUM_IP;
	}
	if (pkt_flags & PACKET_CSUM_ZERO_INVERT) {
		mbuf_flags |= CSUM_ZERO_INVERT;
	}

	return mbuf_flags;
}

/*
 * This routine is used for copying from a packet originating from a native
 * skywalk interface to an mbuf destined for the host legacy stack (NR_RX),
 * as well as for mbufs destined for the compat network interfaces (NR_TX).
 *
 * We adjust the length to reflect the total data span.
 *
 * This routine supports copying into an mbuf chain for RX but not TX.
 *
 * start/stuff is relative to poff, within [0, len], such that
 * [ 0 ... poff ... poff + start/stuff ... poff + len ... ]
 */
void
pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff,
    struct mbuf *m, const uint16_t moff, const uint32_t len,
    const boolean_t copysum, const uint16_t start)
{
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	struct mbuf *curr_m;
	uint32_t partial = 0;
	uint32_t remaining_len = len, copied_len = 0;
	uint16_t csum = 0;
	uint8_t *baddr;
	uint8_t *dp;
	boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt);

	ASSERT(len >= start);
	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet */
	MD_BUFLET_ADDR_ABS(pkt, baddr);
	ASSERT(baddr != NULL);
	baddr += poff;
	VERIFY((poff + len) <= PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp));

	ASSERT((m->m_flags & M_PKTHDR));
	m->m_data += moff;

	switch (t) {
	case NR_RX:
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/*
		 * Use _pkt_copy() to copy the portion up to the
		 * point where we need to start the checksum, and
		 * copy the remainder, checksumming as we go.
		 */
		if (__probable(do_sum && start != 0)) {
			ASSERT(M_TRAILINGSPACE(m) >= start);
			ASSERT(m->m_len == 0);
			dp = (uint8_t *)m_mtod_current(m);
			_pkt_copy(baddr, dp, start);
			remaining_len -= start;
			copied_len += start;
			m->m_len += start;
			m->m_pkthdr.len += start;
		}
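		/*
		 * Walk the mbuf chain, filling the trailing space of each
		 * mbuf (and checksumming as we go when requested) until
		 * all remaining bytes have been copied.
		 */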
		curr_m = m;
		while (curr_m != NULL && remaining_len != 0) {
			uint32_t tmp_len = MIN(remaining_len,
			    (uint32_t)M_TRAILINGSPACE(curr_m));
			dp = (uint8_t *)m_mtod_end(curr_m);
			if (__probable(do_sum)) {
				partial = __packet_copy_and_sum((baddr + copied_len),
				    dp, tmp_len, partial);
			} else {
				_pkt_copy((baddr + copied_len), dp, tmp_len);
			}

			curr_m->m_len += tmp_len;
			m->m_pkthdr.len += tmp_len;
			copied_len += tmp_len;
			remaining_len -= tmp_len;
			curr_m = curr_m->m_next;
		}
		ASSERT(remaining_len == 0);

		if (__probable(do_sum)) {
			csum = __packet_fold_sum(partial);

			m->m_pkthdr.csum_flags |=
			    (CSUM_DATA_VALID | CSUM_PARTIAL);
			m->m_pkthdr.csum_rx_start = start;
			m->m_pkthdr.csum_rx_val = csum;
		} else {
			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			_CASSERT(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS);
			m->m_pkthdr.csum_flags |= pkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS;
			if (__improbable((pkt->pkt_csum_flags & PACKET_CSUM_PARTIAL) != 0)) {
				m->m_pkthdr.csum_flags |= CSUM_PARTIAL;
			}
		}

		/* translate packet metadata */
		mbuf_set_timestamp(m, pkt->pkt_timestamp,
		    ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " mbuf 0x%llx moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(m), moff, m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_rx_start_off,
		    (uint32_t)pkt->pkt_csum_rx_value);
		break;

	case NR_TX:
		dp = (uint8_t *)m_mtod_current(m);
		ASSERT(m->m_next == NULL);

		VERIFY(((intptr_t)dp - (intptr_t)mbuf_datastart(m)) + len <=
		    (uint32_t)mbuf_maxlen(m));
		m->m_len += len;
		m->m_pkthdr.len += len;
		VERIFY(m->m_len == m->m_pkthdr.len &&
		    (uint32_t)m->m_len <= (uint32_t)mbuf_maxlen(m));

		if (copysum) {
			uint16_t stuff = pkt->pkt_csum_tx_stuff_off;
			/*
			 * Use _pkt_copy() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (__probable(start != 0)) {
				_pkt_copy(baddr, dp, start);
			}
			partial = __packet_copy_and_sum((baddr + start),
			    (dp + start), (len - start), 0);
			csum = __packet_fold_sum_final(partial);

			/* RFC1122 4.1.3.4: Invert 0 to -0 (for UDP) */
			if (csum == 0 &&
			    (pkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(dp + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(dp + stuff) = csum;
			} else {
				bcopy((void *)&csum, dp + stuff, sizeof(csum));
			}
		} else {
			_pkt_copy(baddr, dp, len);
		}
		m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
		m->m_pkthdr.csum_tx_start = 0;
		m->m_pkthdr.csum_tx_stuff = 0;
		m->m_pkthdr.csum_flags |= _convert_pkt_csum_flags(pkt->pkt_csum_flags);

		/* translate packet metadata */
		m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
		m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
		m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
		m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
		m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
		m->m_pkthdr.tso_segsz = pkt->pkt_proto_seg_sz;
		m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
		mbuf_set_timestamp(m, pkt->pkt_timestamp,
		    ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));
		m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
		m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
		if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
			m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
		}
		if ((pkt->pkt_pflags & PKT_F_L4S) != 0) {
			m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
		}
		if (__improbable(copy_pkt_tx_time != 0 &&
		    (pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) {
			struct m_tag *tag = NULL;
			tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
			    sizeof(uint64_t), M_WAITOK, m);
			if (tag != NULL) {
				m_tag_prepend(m, tag);
				*(uint64_t *)tag->m_tag_data = pkt->pkt_com_opt->__po_pkt_tx_time;
			}
		}
		m->m_pkthdr.necp_mtag.necp_policy_id = pkt->pkt_policy_id;
		m->m_pkthdr.necp_mtag.necp_skip_policy_id = pkt->pkt_skip_policy_id;

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    " pkt 0x%llx poff %u csumf/txstart/txstuff 0x%x/%u/%u",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_tx_start_off,
		    (uint32_t)pkt->pkt_csum_tx_stuff_off);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (pkt->pkt_link_flags & PKT_LINKF_BCAST) {
		m->m_flags |= M_BCAST;
	} else if (pkt->pkt_link_flags & PKT_LINKF_MCAST) {
		m->m_flags |= M_MCAST;
	}
	SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    (t == NR_RX) ? "RX" : "TX",
	    sk_dump("buf", (uint8_t *)dp, m->m_len, 128, NULL, 0));
}

/*
 * This is a multi-buflet variant of pkt_copy_to_mbuf().
 * NOTE: poff is the offset within the packet.
 *
 * This routine supports copying into an mbuf chain for RX but not TX.
 *
 * start/stuff is relative to poff, within [0, len], such that
 * [ 0 ... poff ... poff + start/stuff ... poff + len ... ]
 */
void
pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph,
    const uint16_t poff, struct mbuf *m, const uint16_t moff,
    const uint32_t len, const boolean_t copysum, const uint16_t start)
{
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	struct mbuf *curr_m;
	uint32_t partial = 0;
	uint32_t remaining_len = len, copied_len = 0;
	uint16_t csum = 0;
	uint8_t *baddr;
	uint8_t *dp;
	boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt);

	ASSERT(len >= start);
	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet */
	MD_BUFLET_ADDR_ABS(pkt, baddr);
	ASSERT(baddr != NULL);
	baddr += poff;
	VERIFY((poff + len) <= (PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp) *
	    __packet_get_buflet_count(ph)));

	ASSERT((m->m_flags & M_PKTHDR));
	m->m_data += moff;

	switch (t) {
	case NR_RX:
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
		if (__probable(do_sum && start != 0)) {
			ASSERT(M_TRAILINGSPACE(m) >= start);
			ASSERT(m->m_len == 0);
			dp = (uint8_t *)m_mtod_current(m);
			_pkt_copy(baddr, dp, start);
			remaining_len -= start;
			copied_len += start;
			m->m_len += start;
			m->m_pkthdr.len += start;
		}
		curr_m = m;
		while (curr_m != NULL && remaining_len != 0) {
			uint32_t tmp_len = MIN(remaining_len,
			    (uint32_t)M_TRAILINGSPACE(curr_m));
			uint16_t soff = poff + (uint16_t)copied_len;
			dp = (uint8_t *)m_mtod_end(curr_m);

			if (__probable(do_sum)) {
				partial = _pkt_copyaddr_sum(ph, soff,
				    dp, tmp_len, TRUE, partial, NULL);
			} else {
				pkt_copyaddr_sum(ph, soff,
				    dp, tmp_len, FALSE, 0, NULL);
			}

			curr_m->m_len += tmp_len;
			m->m_pkthdr.len += tmp_len;
			copied_len += tmp_len;
			remaining_len -= tmp_len;
			curr_m = curr_m->m_next;
		}
		ASSERT(remaining_len == 0);

		if (__probable(do_sum)) {
			csum = __packet_fold_sum(partial);

			m->m_pkthdr.csum_flags |=
			    (CSUM_DATA_VALID | CSUM_PARTIAL);
			m->m_pkthdr.csum_rx_start = start;
			m->m_pkthdr.csum_rx_val = csum;
		} else {
			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			_CASSERT(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS);
			m->m_pkthdr.csum_flags |= pkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS;
			if (__improbable((pkt->pkt_csum_flags & PACKET_CSUM_PARTIAL) != 0)) {
				m->m_pkthdr.csum_flags |= CSUM_PARTIAL;
			}
		}

		m->m_pkthdr.necp_mtag.necp_policy_id = pkt->pkt_policy_id;
		m->m_pkthdr.necp_mtag.necp_skip_policy_id = pkt->pkt_skip_policy_id;

		/* translate packet metadata */
		mbuf_set_timestamp(m, pkt->pkt_timestamp,
		    ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " mbuf 0x%llx moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(m), moff, m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_rx_start_off,
		    (uint32_t)pkt->pkt_csum_rx_value);
		break;
	case NR_TX:
		dp = (uint8_t *)m_mtod_current(m);
		ASSERT(m->m_next == NULL);
		VERIFY(((intptr_t)dp - (intptr_t)mbuf_datastart(m)) + len <=
		    (uint32_t)mbuf_maxlen(m));
		m->m_len += len;
		m->m_pkthdr.len += len;
		VERIFY(m->m_len == m->m_pkthdr.len &&
		    (uint32_t)m->m_len <= (uint32_t)mbuf_maxlen(m));
		if (copysum) {
			uint16_t stuff = pkt->pkt_csum_tx_stuff_off;
			/*
			 * Use _pkt_copy() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (__probable(start != 0)) {
				_pkt_copy(baddr, dp, start);
			}
			partial = _pkt_copyaddr_sum(ph, (poff + start),
			    (dp + start), (len - start), TRUE, 0, NULL);
			csum = __packet_fold_sum_final(partial);

			/* RFC1122 4.1.3.4: Invert 0 to -0 (for UDP) */
			if (csum == 0 &&
			    (pkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(dp + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(dp + stuff) = csum;
			} else {
				bcopy((void *)&csum, dp + stuff, sizeof(csum));
			}
		} else {
			(void) _pkt_copyaddr_sum(ph, poff, dp, len, FALSE, 0, NULL);
		}
		m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
		m->m_pkthdr.csum_tx_start = 0;
		m->m_pkthdr.csum_tx_stuff = 0;
		m->m_pkthdr.csum_flags |= _convert_pkt_csum_flags(pkt->pkt_csum_flags);

		/* translate packet metadata */
		m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
		m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
		m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
		m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
		m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
		m->m_pkthdr.tso_segsz = pkt->pkt_proto_seg_sz;
		m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
		mbuf_set_timestamp(m, pkt->pkt_timestamp,
		    ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));
		m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
		m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
		if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
			m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
		}
		if ((pkt->pkt_pflags & PKT_F_L4S) != 0) {
			m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
		}
		if (__improbable(copy_pkt_tx_time != 0 &&
		    (pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) {
			struct m_tag *tag = NULL;
			tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
			    sizeof(uint64_t), M_WAITOK, m);
			if (tag != NULL) {
				m_tag_prepend(m, tag);
				*(uint64_t *)tag->m_tag_data = pkt->pkt_com_opt->__po_pkt_tx_time;
			}
		}

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    " pkt 0x%llx poff %u csumf/txstart/txstuff 0x%x/%u/%u",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_tx_start_off,
		    (uint32_t)pkt->pkt_csum_tx_stuff_off);
1840 break;
1841
1842 default:
1843 VERIFY(0);
1844 /* NOTREACHED */
1845 __builtin_unreachable();
1846 }
1847
1848 if (pkt->pkt_link_flags & PKT_LINKF_BCAST) {
1849 m->m_flags |= M_BCAST;
1850 } else if (pkt->pkt_link_flags & PKT_LINKF_MCAST) {
1851 m->m_flags |= M_MCAST;
1852 }
1853 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
1854 sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
1855 (t == NR_RX) ? "RX" : "TX",
1856 sk_dump("buf", (uint8_t *)dp, m->m_len, 128, NULL, 0));
1857 }
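
/*
 * Illustrative sketch, not compiled: one way an RX path could use
 * pkt_copy_multi_buflet_to_mbuf() above.  The helper name and the way
 * the caller obtains the packet length and checksum start offset are
 * hypothetical; real callers (e.g. the flowswitch) derive these from
 * their own adapter and flow state.
 */
#if 0
static struct mbuf *
example_rx_pkt_to_mbuf(kern_packet_t ph, uint32_t pktlen,
    uint16_t csum_start)
{
	struct mbuf *m = NULL;

	/* allocate an mbuf chain with room for pktlen bytes */
	if (mbuf_allocpacket(MBUF_WAITOK, pktlen, NULL, &m) != 0) {
		return NULL;
	}
	/*
	 * Copy the whole packet (payload offset 0, mbuf offset 0) and
	 * compute a partial checksum from csum_start onward; the routine
	 * sets CSUM_DATA_VALID|CSUM_PARTIAL in m_pkthdr on our behalf.
	 */
	pkt_copy_multi_buflet_to_mbuf(NR_RX, ph, 0, m, 0, pktlen,
	    TRUE, csum_start);
	return m;
}
#endif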

/*
 * Like m_copydata(), but computes a 16-bit partial inet checksum as
 * the data is copied.  The caller can provide an initial sum to be
 * folded into the computed sum.  The accumulated partial sum (32-bit)
 * is returned to the caller, who is responsible for reducing it to
 * 16 bits and performing the final 1's complement, if needed.
 */
uint32_t
m_copydata_sum(struct mbuf *m, int off, int len, void *__sized_by(len)vp,
    uint32_t initial_sum, boolean_t *odd_start)
{
	boolean_t needs_swap, started_on_odd = FALSE;
	int off0 = off, len0 = len;
	struct mbuf *m0 = m;
	uint64_t sum, partial;
	unsigned count, odd;
	char *cp = vp;

	if (__improbable(off < 0 || len < 0)) {
		panic("%s: invalid offset %d or len %d", __func__, off, len);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	while (off > 0) {
		if (__improbable(m == NULL)) {
			panic("%s: invalid mbuf chain %p [off %d, len %d]",
			    __func__, m0, off0, len0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		if (off < m->m_len) {
			break;
		}
		off -= m->m_len;
		m = m->m_next;
	}

	if (odd_start) {
		started_on_odd = *odd_start;
	}
	sum = initial_sum;

	for (; len0 > 0; m = m->m_next) {
		uint8_t *datap;

		if (__improbable(m == NULL)) {
			panic("%s: invalid mbuf chain %p [off %d, len %d]",
			    __func__, m0, off0, len);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		datap = mtod(m, uint8_t *) + off;
		count = m->m_len;

		if (__improbable(count == 0)) {
			continue;
		}

		count = MIN(count - off, (unsigned)len0);
		partial = 0;

		if ((uintptr_t)datap & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *datap << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial = *datap;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			*cp++ = *datap++;
			count -= 1;
			len0 -= 1;
		}

		needs_swap = started_on_odd;
		odd = count & 1u;
		count -= odd;

		if (count) {
			partial = __packet_copy_and_sum(datap,
			    cp, count, (uint32_t)partial);
			datap += count;
			cp += count;
			len0 -= count;
			if (__improbable((partial & (3ULL << 62)) != 0)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 56);
				}
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}

		if (odd) {
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *datap;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial += *datap << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			*cp++ = *datap++;
			len0 -= 1;
			started_on_odd = !started_on_odd;
		}
		off = 0;

		if (needs_swap) {
			partial = (partial << 8) + (partial >> 56);
		}
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}

	if (odd_start) {
		*odd_start = started_on_odd;
	}

	/* Final fold (reduce 64-bit to 32-bit) */
	sum = (sum >> 32) + (sum & 0xffffffff); /* 33-bit */
	sum = (sum >> 16) + (sum & 0xffff);     /* 17-bit + carry */

	/* return 32-bit partial sum to caller */
	return (uint32_t)sum;
}
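
/*
 * Illustrative sketch, not compiled: finishing the partial sum that
 * m_copydata_sum() returns.  The helper name and buffer handling are
 * hypothetical; it just shows the caller-side reduction to 16 bits
 * and the final 1's complement described in the comment above.
 */
#if 0
static uint16_t
example_finish_sum(struct mbuf *m, int off, int len,
    void *__sized_by(len)buf)
{
	boolean_t odd = FALSE;
	uint32_t partial = m_copydata_sum(m, off, len, buf, 0, &odd);

	/* fold the 32-bit partial sum down to 16 bits (with carry) */
	partial = (partial >> 16) + (partial & 0xffff);
	partial += (partial >> 16);

	/* 1's complement for the on-the-wire checksum */
	return (uint16_t)(~partial & 0xffff);
}
#endif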

#if DEBUG || DEVELOPMENT
#define TRAILERS_MAX    16              /* max trailing bytes */
#define TRAILERS_REGEN  (64 * 1024)     /* regeneration threshold */
static uint8_t tb[TRAILERS_MAX];        /* random trailing bytes */
static uint32_t regen = TRAILERS_REGEN; /* regeneration counter */

uint32_t
pkt_add_trailers(kern_packet_t ph, const uint32_t len, const uint16_t start)
{
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint32_t extra;
	uint8_t *baddr;

	/* get buffer address from packet */
	MD_BUFLET_ADDR_ABS(pkt, baddr);
	ASSERT(baddr != NULL);
	ASSERT(len <= PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp));

	extra = MIN((uint32_t)pkt_trailers, (uint32_t)TRAILERS_MAX);
	if (extra == 0 || extra > sizeof(tb) ||
	    (len + extra) > PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp)) {
		return 0;
	}

	/* generate random bytes once per TRAILERS_REGEN packets (approx.) */
	if (regen++ == TRAILERS_REGEN) {
		read_frandom(&tb[0], sizeof(tb));
		regen = 0;
	}

	bcopy(&tb[0], (baddr + len), extra);

	/* recompute partial sum (also to exercise related logic) */
	pkt->pkt_csum_flags |= PACKET_CSUM_PARTIAL;
	pkt->pkt_csum_rx_value = (uint16_t)__packet_cksum((baddr + start),
	    ((len + extra) - start), 0);
	pkt->pkt_csum_rx_start_off = start;

	return extra;
}

uint32_t
pkt_add_trailers_mbuf(struct mbuf *m, const uint16_t start)
{
	uint32_t extra;

	extra = MIN((uint32_t)pkt_trailers, (uint32_t)TRAILERS_MAX);
	if (extra == 0 || extra > sizeof(tb)) {
		return 0;
	}

	/* generate random bytes once per TRAILERS_REGEN packets (approx.) */
	if (regen++ == TRAILERS_REGEN) {
		read_frandom(&tb[0], sizeof(tb));
		regen = 0;
	}

	/* append the trailing bytes before summing over them below */
	if (mbuf_copyback(m, m_pktlen(m), extra, &tb[0], M_NOWAIT) != 0) {
		return 0;
	}

	/* recompute partial sum (also to exercise related logic) */
	m->m_pkthdr.csum_rx_val = m_sum16(m, start, (m_pktlen(m) - start));
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
	m->m_pkthdr.csum_rx_start = start;

	return extra;
}
#endif /* DEBUG || DEVELOPMENT */
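
/*
 * Illustrative sketch, not compiled: the trailer injection above is
 * gated on the pkt_trailers development sysctl.  A userland test
 * might enable it along these lines (helper name hypothetical):
 */
#if 0
#include <sys/sysctl.h>

static int
example_enable_trailers(int nbytes)
{
	/* values beyond TRAILERS_MAX (16) are clamped by the kernel */
	return sysctlbyname("kern.skywalk.packet.trailers",
	           NULL, NULL, &nbytes, sizeof(nbytes));
}
#endif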

void
pkt_copypkt_sum(kern_packet_t sph, uint16_t soff, kern_packet_t dph,
    uint16_t doff, uint16_t len, uint32_t *partial, boolean_t do_csum)
{
	VERIFY(_pkt_copypkt_sum(sph, soff, dph, doff, len, partial, do_csum));
}

uint32_t
pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff,
    uint8_t *__sized_by(len)dbaddr, uint32_t len, boolean_t do_csum,
    uint32_t initial_sum, boolean_t *odd_start)
{
	return _pkt_copyaddr_sum(sph, soff, dbaddr, len, do_csum,
	           initial_sum, odd_start);
}

uint32_t
pkt_mcopypkt_sum(mbuf_t m, int soff, kern_packet_t dph, uint16_t doff,
    uint16_t len, boolean_t do_csum)
{
	return m_copypkt_sum(m, soff, dph, doff, len, do_csum);
}

void
pkt_copy(void *__sized_by(len)src, void *__sized_by(len)dst, size_t len)
{
	_pkt_copy(src, dst, len);
}