1 /*
2 * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <machine/endian.h>
31 #include <net/necp.h>
32
33 uint32_t copy_pkt_tx_time = 1;
34 #if (DEVELOPMENT || DEBUG)
35 SYSCTL_NODE(_kern_skywalk, OID_AUTO, packet,
36 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk packet");
37 int pkt_trailers = 0; /* for testing trailing bytes */
38 SYSCTL_INT(_kern_skywalk_packet, OID_AUTO, trailers,
39 CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_trailers, 0, "");
40
41 SYSCTL_UINT(_kern_skywalk_packet, OID_AUTO, copy_pkt_tx_time,
42 CTLFLAG_RW | CTLFLAG_LOCKED, ©_pkt_tx_time, 0,
43 "copy tx time from pkt to mbuf");
44 #endif /* !DEVELOPMENT && !DEBUG */
45
46
47 __attribute__((always_inline))
48 static inline void
_pkt_copy(void * __sized_by (len)src,void * __sized_by (len)dst,size_t len)49 _pkt_copy(void *__sized_by(len)src, void *__sized_by(len)dst, size_t len)
50 {
51 if (__probable(IS_P2ALIGNED(src, 8) && IS_P2ALIGNED(dst, 8))) {
52 switch (len) {
53 case 20: /* standard IPv4 header */
54 sk_copy64_20(src, dst);
55 return;
56
57 case 40: /* IPv6 header */
58 sk_copy64_40(src, dst);
59 return;
60
61 default:
62 if (IS_P2ALIGNED(len, 64)) {
63 sk_copy64_64x(src, dst, len);
64 return;
65 } else if (IS_P2ALIGNED(len, 32)) {
66 sk_copy64_32x(src, dst, len);
67 return;
68 } else if (IS_P2ALIGNED(len, 8)) {
69 sk_copy64_8x(src, dst, len);
70 return;
71 } else if (IS_P2ALIGNED(len, 4)) {
72 sk_copy64_4x(src, dst, len);
73 return;
74 }
75 break;
76 }
77 }
78 bcopy(src, dst, len);
79 }
80
81 /*
82 * This routine is used for copying data across two kernel packets.
83 * Can also optionally compute 16-bit partial inet checksum as the
84 * data is copied.
85 * This routine is used by flowswitch while copying packet from vp
 * adapter pool to packet in native netif pool and vice versa.
87 *
88 * start/stuff is relative to soff, within [0, len], such that
89 * [ 0 ... soff ... soff + start/stuff ... soff + len ... ]
90 */
void
pkt_copy_from_pkt(const enum txrx t, kern_packet_t dph, const uint16_t doff,
    kern_packet_t sph, const uint16_t soff, const uint32_t len,
    const boolean_t copysum, const uint16_t start, const uint16_t stuff,
    const boolean_t invert)
{
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	uint32_t partial;
	uint16_t csum = 0;
	uint8_t *sbaddr, *dbaddr;
	/*
	 * Only checksum here if the caller asked for it AND the source
	 * packet does not already carry a full checksum.
	 */
	boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet, then apply the given offsets */
	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
	ASSERT(sbaddr != NULL);
	sbaddr += soff;
	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
	ASSERT(dbaddr != NULL);
	dbaddr += doff;
	/* copy must fit within the destination's default buffer */
	VERIFY((doff + len) <= PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));

	switch (t) {
	case NR_RX:
		dpkt->pkt_csum_flags = 0;
		if (__probable(do_sum)) {
			/*
			 * Use pkt_copy() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (__probable(start != 0)) {
				_pkt_copy(sbaddr, dbaddr, start);
			}
			partial = __packet_copy_and_sum((sbaddr + start),
			    (dbaddr + start), (len - start), 0);
			csum = __packet_fold_sum(partial);

			/* record folded 16-bit partial sum on destination */
			__packet_set_inet_checksum(dph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
		} else {
			/* straight copy; carry over RX checksum metadata */
			_pkt_copy(sbaddr, dbaddr, len);
			dpkt->pkt_csum_rx_start_off = spkt->pkt_csum_rx_start_off;
			dpkt->pkt_csum_rx_value = spkt->pkt_csum_rx_value;
			dpkt->pkt_csum_flags |= spkt->pkt_csum_flags & PACKET_CSUM_RX_FLAGS;
		}

		SK_DF(SK_VERB_COPY | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY | SK_VERB_RX,
		    " pkt 0x%llx doff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(dpkt), doff, dpkt->pkt_csum_flags,
		    (uint32_t)dpkt->pkt_csum_rx_start_off,
		    (uint32_t)dpkt->pkt_csum_rx_value);
		break;

	case NR_TX:
		if (copysum) {
			/*
			 * Use pkt_copy() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (__probable(start != 0)) {
				_pkt_copy(sbaddr, dbaddr, start);
			}
			partial = __packet_copy_and_sum((sbaddr + start),
			    (dbaddr + start), (len - start), 0);
			csum = __packet_fold_sum_final(partial);

			/* RFC1122 4.1.3.4: Invert 0 to -0 for UDP */
			if (csum == 0 && invert) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(dbaddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(dbaddr + stuff) = csum;
			} else {
				/* unaligned stuff offset; store byte-wise */
				bcopy((void *)&csum, dbaddr + stuff,
				    sizeof(csum));
			}
		} else {
			_pkt_copy(sbaddr, dbaddr, len);
		}
		/* keep TSO/offload intent; clear per-packet tx offsets */
		dpkt->pkt_csum_flags = spkt->pkt_csum_flags &
		    (PACKET_CSUM_TSO_FLAGS | PACKET_TX_CSUM_OFFLOAD_FLAGS);
		dpkt->pkt_csum_tx_start_off = 0;
		dpkt->pkt_csum_tx_stuff_off = 0;

		SK_DF(SK_VERB_COPY | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u, flags %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start, dpkt->pkt_csum_flags);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* account for the copied bytes in the destination metadata */
	METADATA_ADJUST_LEN(dpkt, len, doff);

	SK_DF(SK_VERB_COPY | SK_VERB_DUMP, "%s(%d) %s %s",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    (t == NR_RX) ? "RX" : "TX",
	    sk_dump("buf", dbaddr, len, 128, NULL, 0));
}
206
207 /*
208 * NOTE: soff is the offset within the packet
209 * The accumulated partial sum (32-bit) is returned to caller in csum_partial;
210 * caller is responsible for further reducing it to 16-bit if needed,
211 * as well as to perform the final 1's complement on it.
212 */
uint32_t static inline
_pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbaddr,
    uint32_t len, boolean_t do_csum, uint32_t initial_sum, boolean_t *odd_start)
{
	uint8_t odd = 0;
	uint8_t *sbaddr = NULL;
	uint32_t sum = initial_sum, partial;
	uint32_t len0 = len;		/* original length, for diagnostics */
	boolean_t needs_swap, started_on_odd = FALSE;
	uint16_t sbcnt, off0 = soff;	/* original offset, for diagnostics */
	uint32_t clen, sboff, sblen;
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	kern_buflet_t sbuf = NULL, sbufp = NULL;

	sbcnt = __packet_get_buflet_count(sph);

	/* resume odd/even byte parity from a previous segment, if any */
	if (odd_start) {
		started_on_odd = *odd_start;
	}

	/* fastpath (copy+sum, single buflet, even aligned, even length) */
	if (do_csum && sbcnt == 1 && len != 0) {
		PKT_GET_NEXT_BUFLET(spkt, 1, sbufp, sbuf);
		ASSERT(sbuf != NULL);
		sboff = __buflet_get_data_offset(sbuf);
		sblen = __buflet_get_data_length(sbuf);
		ASSERT(sboff <= soff);
		ASSERT(soff < sboff + sblen);
		sblen -= (soff - sboff);
		sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;

		clen = (uint16_t)MIN(len, sblen);

		/* even address + even length: one pass, fold, done */
		if (((uintptr_t)sbaddr & 1) == 0 && clen && (clen & 1) == 0) {
			sum = __packet_copy_and_sum(sbaddr, dbaddr, clen, sum);
			return __packet_fold_sum(sum);
		}

		/* odd alignment/length: fall through to the slowpath */
		sbaddr = NULL;
		sbuf = sbufp = NULL;
	}

	while (len != 0) {
		PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf);
		if (__improbable(sbuf == NULL)) {
			panic("%s: bad packet, 0x%llx [off %d, len %d]",
			    __func__, SK_KVA(spkt), off0, len0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		sbufp = sbuf;
		sboff = __buflet_get_data_offset(sbuf);
		sblen = __buflet_get_data_length(sbuf);
		ASSERT((sboff <= soff) && (soff < sboff + sblen));
		sblen -= (soff - sboff);
		sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;
		/* "soff" applies to the first buflet only */
		soff = 0;
		clen = (uint16_t)MIN(len, sblen);
		if (__probable(do_csum)) {
			partial = 0;
			if (__improbable((uintptr_t)sbaddr & 1)) {
				/* Align on word boundary */
				started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
				partial = (uint8_t)*sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial = (uint8_t)*sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				/*
				 * -fbounds-safety: *dbaddr++ = *sbaddr++ fails
				 * to compile. But the following works. Also,
				 * grouping dbaddr and len updates led to higher
				 * throughput performance, compared to doing
				 * dbaddr++; sbaddr++; len -= 1; in that order.
				 */
				*dbaddr = *sbaddr;
				dbaddr++;
				sblen -= 1;
				clen -= 1;
				len -= 1;
				sbaddr++;
			}
			/* if prior bytes were odd, fold with byte swap later */
			needs_swap = started_on_odd;

			/* copy/sum the even portion; trailing odd byte below */
			odd = clen & 1u;
			clen -= odd;

			if (clen != 0) {
				partial = __packet_copy_and_sum(sbaddr, dbaddr,
				    clen, partial);
			}

			/* reduce "partial" before it can overflow 32 bits */
			if (__improbable(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		} else {
			_pkt_copy(sbaddr, dbaddr, clen);
		}

		dbaddr += clen;

		/*
		 * -fbounds-safety: the following 3 lines were moved up from
		 * after the if-block. None of these are modified in the
		 * if-block, so moving these up here shouldn't change the
		 * behavior. Also, updating len before updating sbaddr led to
		 * faster throughput than doing: dbaddr += clen; sbaddr += clen;
		 * len -= clen + odd;
		 */
		sblen -= clen + odd;
		len -= clen + odd;
		ASSERT(sblen == 0 || len == 0);

		sbaddr += clen;

		if (__probable(do_csum)) {
			/* pick up the trailing odd byte, if any */
			if (odd != 0) {
#if BYTE_ORDER == LITTLE_ENDIAN
				partial += (uint8_t)*sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial += (uint8_t)*sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				*dbaddr++ = *sbaddr++;
				started_on_odd = !started_on_odd;
			}

			if (needs_swap) {
				partial = (partial << 8) + (partial >> 24);
			}
			sum += (partial >> 16) + (partial & 0xffff);
			/*
			 * Reduce sum to allow potential byte swap
			 * in the next iteration without carry.
			 */
			sum = (sum >> 16) + (sum & 0xffff);
		}
	}

	/* report the final parity back to the caller for chaining */
	if (odd_start) {
		*odd_start = started_on_odd;
	}

	if (__probable(do_csum)) {
		/* Final fold (reduce 32-bit to 16-bit) */
		sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
		sum = (sum >> 16) + (sum & 0xffff);
	}
	return sum;
}
368
369 /*
370 * NOTE: Caller of this function is responsible to adjust the length and offset
371 * of the first buflet of the destination packet if (doff != 0),
 * i.e. additional data is being prepended to the packet.
373 * It should also finalize the packet.
374 * To simplify & optimize the routine, we have also assumed that soff & doff
375 * will lie within the first buffer, which is true for the current use cases
376 * where, doff is the offset of the checksum field in the TCP/IP header and
377 * soff is the L3 offset.
378 * The accumulated partial sum (32-bit) is returned to caller in csum_partial;
379 * caller is responsible for further reducing it to 16-bit if needed,
380 * as well as to perform the final 1's complement on it.
381 */
static inline boolean_t
_pkt_copypkt_sum(kern_packet_t sph, uint16_t soff, kern_packet_t dph,
    uint16_t doff, uint32_t len, uint32_t *csum_partial, boolean_t do_csum)
{
	uint8_t odd = 0;
	uint32_t sum = 0, partial;
	boolean_t needs_swap, started_on_odd = FALSE;
	uint8_t *sbaddr = NULL, *dbaddr = NULL;
	uint16_t sbcnt, dbcnt;
	uint32_t clen, dlen0, sboff, sblen, dlim;
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	kern_buflet_t sbuf = NULL, sbufp = NULL, dbuf = NULL, dbufp = NULL;

	ASSERT(csum_partial != NULL || !do_csum);
	sbcnt = __packet_get_buflet_count(sph);
	dbcnt = __packet_get_buflet_count(dph);

	/*
	 * Walk source and destination buflet chains in lockstep;
	 * exactly one of sbaddr/dbaddr is refreshed per iteration.
	 */
	while (len != 0) {
		ASSERT(sbaddr == NULL || dbaddr == NULL);
		if (sbaddr == NULL) {
			/* advance to the next source buflet */
			PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf);
			if (__improbable(sbuf == NULL)) {
				break;
			}
			sbufp = sbuf;
			sblen = __buflet_get_data_length(sbuf);
			sboff = __buflet_get_data_offset(sbuf);
			ASSERT(soff >= sboff);
			ASSERT(sboff + sblen > soff);
			sblen -= (soff - sboff);
			sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;
			/* "soff" applies to the first buflet only */
			soff = 0;
		}

		if (dbaddr == NULL) {
			/* finalize the previous destination buflet's length */
			if (dbufp != NULL) {
				__buflet_set_data_length(dbufp, dlen0);
			}

			PKT_GET_NEXT_BUFLET(dpkt, dbcnt, dbufp, dbuf);
			if (__improbable(dbuf == NULL)) {
				break;
			}
			dbufp = dbuf;
			dlim = __buflet_get_data_limit(dbuf);
			ASSERT(dlim > doff);
			dlim -= doff;
			if (doff != 0) {
				VERIFY(__buflet_set_data_offset(dbuf, doff) == 0);
			}
			dbaddr = (uint8_t *)__buflet_get_data_address(dbuf) + doff;
			dlen0 = dlim;
			/* "doff" applies to the first buflet only */
			doff = 0;
		}

		/* copy whatever both current buflets can accommodate */
		clen = MIN(len, sblen);
		clen = MIN(clen, dlim);

		if (__probable(do_csum)) {
			partial = 0;
			if (__improbable((uintptr_t)sbaddr & 1)) {
				/* Align on word boundary */
				started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
				partial = (uint8_t)*sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial = (uint8_t)*sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				*dbaddr++ = *sbaddr++;
				clen -= 1;
				dlim -= 1;
				len -= 1;
			}
			/* prior odd bytes require a byte-swapped fold */
			needs_swap = started_on_odd;

			/* handle the even portion; trailing odd byte below */
			odd = clen & 1u;
			clen -= odd;

			if (clen != 0) {
				partial = __packet_copy_and_sum(sbaddr, dbaddr,
				    clen, partial);
			}

			/* reduce "partial" before it can overflow 32 bits */
			if (__improbable(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		} else {
			_pkt_copy(sbaddr, dbaddr, clen);
		}
		sbaddr += clen;
		dbaddr += clen;

		if (__probable(do_csum)) {
			/* pick up the trailing odd byte, if any */
			if (odd != 0) {
#if BYTE_ORDER == LITTLE_ENDIAN
				partial += (uint8_t)*sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
				partial += (uint8_t)*sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
				*dbaddr++ = *sbaddr++;
				started_on_odd = !started_on_odd;
			}

			if (needs_swap) {
				partial = (partial << 8) + (partial >> 24);
			}
			sum += (partial >> 16) + (partial & 0xffff);
			/*
			 * Reduce sum to allow potential byte swap
			 * in the next iteration without carry.
			 */
			sum = (sum >> 16) + (sum & 0xffff);
		}

		sblen -= clen + odd;
		dlim -= clen + odd;
		len -= clen + odd;

		/* NULL out exhausted buflet pointers to trigger a refresh */
		if (sblen == 0) {
			sbaddr = NULL;
		}

		if (dlim == 0) {
			dbaddr = NULL;
		}
	}

	/* set the final (possibly partial) destination buflet length */
	if (__probable(dbuf != NULL)) {
		__buflet_set_data_length(dbuf, (dlen0 - dlim));
	}
	if (__probable(do_csum)) {
		/* Final fold (reduce 32-bit to 16-bit) */
		sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
		sum = (sum >> 16) + (sum & 0xffff);
		*csum_partial = (uint32_t)sum;
	}
	/* TRUE iff all "len" bytes were copied before a chain ran out */
	return len == 0;
}
527
/*
 * Compute a 16-bit one's complement partial sum over "len" bytes of
 * packet "sph" starting at absolute offset "soff", walking the buflet
 * chain.  Returns the folded (but not complemented) sum.
 */
uint32_t
pkt_sum(kern_packet_t sph, uint16_t soff, uint16_t len)
{
	uint8_t odd = 0;
	uint32_t sum = 0, partial;
	boolean_t needs_swap, started_on_odd = FALSE;
	uint8_t *sbaddr = NULL;
	uint16_t sbcnt;
	uint32_t clen, sblen, sboff;
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	kern_buflet_t sbuf = NULL, sbufp = NULL;

	sbcnt = __packet_get_buflet_count(sph);

	/* fastpath (single buflet, even aligned, even length) */
	if (sbcnt == 1 && len != 0) {
		PKT_GET_NEXT_BUFLET(spkt, 1, sbufp, sbuf);
		ASSERT(sbuf != NULL);
		sblen = __buflet_get_data_length(sbuf);
		sboff = __buflet_get_data_offset(sbuf);
		ASSERT(soff >= sboff);
		ASSERT(sboff + sblen > soff);
		sblen -= (soff - sboff);
		sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;

		clen = MIN(len, sblen);

		/* even address + even length: one pass, fold, done */
		if (((uintptr_t)sbaddr & 1) == 0 && clen && (clen & 1) == 0) {
			sum = __packet_cksum(sbaddr, clen, 0);
			return __packet_fold_sum(sum);
		}

		sbaddr = NULL;
		sbuf = sbufp = NULL;
	}

	/* slowpath */
	while (len != 0) {
		ASSERT(sbaddr == NULL);
		if (sbaddr == NULL) {
			/* advance to the next source buflet */
			PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf);
			if (__improbable(sbuf == NULL)) {
				break;
			}
			sbufp = sbuf;
			sblen = __buflet_get_data_length(sbuf);
			sboff = __buflet_get_data_offset(sbuf);
			ASSERT(soff >= sboff);
			ASSERT(sboff + sblen > soff);
			sblen -= (soff - sboff);
			sbaddr = (uint8_t *)__buflet_get_data_address(sbuf) + soff;
			/* "soff" applies to the first buflet only */
			soff = 0;
		}

		clen = MIN(len, sblen);

		partial = 0;
		if (__improbable((uintptr_t)sbaddr & 1)) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = (uint8_t)*sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial = (uint8_t)*sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			clen -= 1;
			len -= 1;
		}
		/* prior odd bytes require a byte-swapped fold */
		needs_swap = started_on_odd;

		/* sum the even portion; trailing odd byte handled below */
		odd = clen & 1u;
		clen -= odd;

		if (clen != 0) {
			partial = __packet_cksum(sbaddr,
			    clen, partial);
		}

		/* reduce "partial" before it can overflow 32 bits */
		if (__improbable(partial & 0xc0000000)) {
			if (needs_swap) {
				partial = (partial << 8) +
				    (partial >> 24);
			}
			sum += (partial >> 16);
			sum += (partial & 0xffff);
			partial = 0;
		}
		sbaddr += clen;

		/* pick up the trailing odd byte, if any */
		if (odd != 0) {
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += (uint8_t)*sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial += (uint8_t)*sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			started_on_odd = !started_on_odd;
		}

		if (needs_swap) {
			partial = (partial << 8) + (partial >> 24);
		}
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);

		sblen -= clen + odd;
		len -= clen + odd;

		/* NULL out an exhausted buflet to trigger a refresh */
		if (sblen == 0) {
			sbaddr = NULL;
		}
	}

	/* Final fold (reduce 32-bit to 16-bit) */
	sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return (uint32_t)sum;
}
649
650
651 /*
652 * This is a multi-buflet variant of pkt_copy_from_pkt().
653 *
654 * start/stuff is relative to soff, within [0, len], such that
655 * [ 0 ... soff ... soff + start/stuff ... soff + len ... ]
656 */
void
pkt_copy_multi_buflet_from_pkt(const enum txrx t, kern_packet_t dph,
    const uint16_t doff, kern_packet_t sph, const uint16_t soff,
    const uint32_t len, const boolean_t copysum, const uint16_t start,
    const uint16_t stuff, const boolean_t invert)
{
	boolean_t rc;
	uint32_t partial;
	uint16_t csum = 0;
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	struct __kern_packet *spkt = SK_PTR_ADDR_KPKT(sph);
	/*
	 * Only checksum here if the caller asked for it AND the source
	 * packet does not already carry a full checksum.
	 */
	boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* copy must fit within the destination's aggregate buffer space */
	VERIFY((doff + len) <= (PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp) *
	    __packet_get_buflet_count(dph)));

	switch (t) {
	case NR_RX:
		dpkt->pkt_csum_flags = 0;
		if (__probable(do_sum)) {
			/*
			 * copy the portion up to the point where we need to
			 * start the checksum, and copy the remainder,
			 * checksumming as we go.
			 */
			if (__probable(start != 0)) {
				rc = _pkt_copypkt_sum(sph, soff, dph, doff,
				    start, NULL, FALSE);
				ASSERT(rc);
			}
			_pkt_copypkt_sum(sph, (soff + start), dph,
			    (doff + start), (len - start), &partial, TRUE);
			csum = __packet_fold_sum(partial);
			__packet_set_inet_checksum(dph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
			/*
			 * adjust metadata for the leading "start" bytes;
			 * the checksummed portion sets buflet lengths in
			 * _pkt_copypkt_sum() itself
			 */
			METADATA_ADJUST_LEN(dpkt, start, doff);
		} else {
			/* straight copy; carry over RX checksum metadata */
			rc = _pkt_copypkt_sum(sph, soff, dph, doff, len, NULL,
			    FALSE);
			ASSERT(rc);
			dpkt->pkt_csum_rx_start_off = spkt->pkt_csum_rx_start_off;
			dpkt->pkt_csum_rx_value = spkt->pkt_csum_rx_value;
			dpkt->pkt_csum_flags |= spkt->pkt_csum_flags & PACKET_CSUM_RX_FLAGS;
		}
		break;

	case NR_TX:
		if (copysum) {
			uint8_t *baddr;
			/*
			 * copy the portion up to the point where we need to
			 * start the checksum, and copy the remainder,
			 * checksumming as we go.
			 */
			if (__probable(start != 0)) {
				rc = _pkt_copypkt_sum(sph, soff, dph, doff,
				    start, NULL, FALSE);
				ASSERT(rc);
			}
			rc = _pkt_copypkt_sum(sph, (soff + start), dph,
			    (doff + start), (len - start), &partial, TRUE);
			ASSERT(rc);
			csum = __packet_fold_sum_final(partial);

			/* RFC1122 4.1.3.4: Invert 0 to -0 for UDP */
			if (csum == 0 && invert) {
				csum = 0xffff;
			}

			/*
			 * Insert checksum into packet.
			 * Here we assume that checksum will be in the
			 * first buffer.
			 */
			ASSERT((stuff + doff + sizeof(csum)) <=
			    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
			ASSERT(stuff <= (len - sizeof(csum)));

			/* get first buflet buffer address from packet */
			MD_BUFLET_ADDR_ABS(dpkt, baddr);
			ASSERT(baddr != NULL);
			baddr += doff;
			if (IS_P2ALIGNED(baddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(baddr + stuff) = csum;
			} else {
				/* unaligned stuff offset; store byte-wise */
				bcopy((void *)&csum, baddr + stuff,
				    sizeof(csum));
			}
			/* adjust metadata for the leading "start" bytes */
			METADATA_ADJUST_LEN(dpkt, start, doff);
		} else {
			rc = _pkt_copypkt_sum(sph, soff, dph, doff, len, NULL,
			    FALSE);
			ASSERT(rc);
		}
		/* keep TSO/offload intent; clear per-packet tx offsets */
		dpkt->pkt_csum_flags = spkt->pkt_csum_flags &
		    (PACKET_CSUM_TSO_FLAGS | PACKET_TX_CSUM_OFFLOAD_FLAGS);
		dpkt->pkt_csum_tx_start_off = 0;
		dpkt->pkt_csum_tx_stuff_off = 0;

		SK_DF(SK_VERB_COPY | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u, flags %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start, dpkt->pkt_csum_flags);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
769
770 static inline uint32_t
_convert_mbuf_csum_flags(uint32_t mbuf_flags)771 _convert_mbuf_csum_flags(uint32_t mbuf_flags)
772 {
773 uint32_t pkt_flags = 0;
774
775 if (mbuf_flags & CSUM_TCP) {
776 pkt_flags |= PACKET_CSUM_TCP;
777 }
778 if (mbuf_flags & CSUM_TCPIPV6) {
779 pkt_flags |= PACKET_CSUM_TCPIPV6;
780 }
781 if (mbuf_flags & CSUM_UDP) {
782 pkt_flags |= PACKET_CSUM_UDP;
783 }
784 if (mbuf_flags & CSUM_UDPIPV6) {
785 pkt_flags |= PACKET_CSUM_UDPIPV6;
786 }
787 if (mbuf_flags & CSUM_IP) {
788 pkt_flags |= PACKET_CSUM_IP;
789 }
790 if (mbuf_flags & CSUM_ZERO_INVERT) {
791 pkt_flags |= PACKET_CSUM_ZERO_INVERT;
792 }
793
794 return pkt_flags;
795 }
796
797 /*
798 * This routine is used for copying an mbuf which originated in the host
799 * stack destined to a native skywalk interface (NR_TX), as well as for
800 * mbufs originating on compat network interfaces (NR_RX).
801 *
802 * start/stuff is relative to moff, within [0, len], such that
803 * [ 0 ... moff ... moff + start/stuff ... moff + len ... ]
804 */
void
pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff,
    struct mbuf *m, const uint16_t moff, const uint32_t len,
    const boolean_t copysum, const uint16_t start)
{
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	struct m_tag *ts_tag = NULL;
	uint32_t partial;
	uint16_t csum = 0;
	uint8_t *baddr;

	_CASSERT(sizeof(csum) == sizeof(uint16_t));

	/* get buffer address from packet and apply the offset */
	MD_BUFLET_ADDR_ABS(pkt, baddr);
	ASSERT(baddr != NULL);
	baddr += poff;
	/* copy must fit within the packet's default buffer */
	VERIFY((poff + len) <= PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp));

	switch (t) {
	case NR_RX:
		/* inherit the mbuf's RX checksum state wholesale */
		pkt->pkt_csum_flags = m->m_pkthdr.csum_flags;
		pkt->pkt_csum_rx_start_off = 0;
		pkt->pkt_csum_rx_value = m->m_pkthdr.csum_rx_val;
		pkt->pkt_svc_class = m_get_service_class(m);
		/* sum only if requested and no full hw checksum present */
		if (__probable(((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS)
		    != CSUM_RX_FULL_FLAGS) && copysum)) {
			/*
			 * Use m_copydata() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (start != 0) {
				m_copydata(m, moff, start, baddr);
			}
			partial = m_copydata_sum(m, start, (len - start),
			    (baddr + start), 0, NULL);
			csum = __packet_fold_sum(partial);

			/* record folded 16-bit partial sum on the packet */
			__packet_set_inet_checksum(ph, PACKET_CSUM_PARTIAL,
			    start, csum, FALSE);
		} else {
			m_copydata(m, moff, len, baddr);
		}
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " mbuf 0x%llx csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(m), m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
		    " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    SK_KVA(pkt), poff, pkt->pkt_csum_flags,
		    (uint32_t)pkt->pkt_csum_rx_start_off,
		    (uint32_t)pkt->pkt_csum_rx_value);
		break;

	case NR_TX:
		if (copysum) {
			uint16_t stuff = m->m_pkthdr.csum_tx_stuff;
			/*
			 * Use m_copydata() to copy the portion up to the
			 * point where we need to start the checksum, and
			 * copy the remainder, checksumming as we go.
			 */
			if (start != 0) {
				m_copydata(m, moff, start, baddr);
			}
			partial = m_copydata_sum(m, start, (len - start),
			    (baddr + start), 0, NULL);
			csum = __packet_fold_sum_final(partial);

			/*
			 * RFC1122 4.1.3.4: Invert 0 to -0 for UDP;
			 * ideally we'd only test for CSUM_ZERO_INVERT
			 * here, but catch cases where the originator
			 * did not set it for UDP.
			 */
			if (csum == 0 && (m->m_pkthdr.csum_flags &
			    (CSUM_UDP | CSUM_UDPIPV6 | CSUM_ZERO_INVERT))) {
				csum = 0xffff;
			}

			/* Insert checksum into packet */
			ASSERT(stuff <= (len - sizeof(csum)));
			if (IS_P2ALIGNED(baddr + stuff, sizeof(csum))) {
				*(uint16_t *)(uintptr_t)(baddr + stuff) = csum;
			} else {
				/* unaligned stuff offset; store byte-wise */
				bcopy((void *)&csum, baddr + stuff,
				    sizeof(csum));
			}
		} else {
			m_copydata(m, moff, len, baddr);
		}
		pkt->pkt_csum_flags = 0;
		pkt->pkt_csum_tx_start_off = 0;
		pkt->pkt_csum_tx_stuff_off = 0;

		/* translate TSO state; IPv4 and IPv6 are mutually exclusive */
		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
			pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV4;
			pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
			ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV6) == 0);
		}
		if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) {
			pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV6;
			pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
			ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV4) == 0);
		}
		/* if not checksummed here, pass the offload request along */
		if (!copysum) {
			pkt->pkt_csum_flags |= _convert_mbuf_csum_flags(m->m_pkthdr.csum_flags);
		}

		/* translate mbuf metadata */
		pkt->pkt_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		pkt->pkt_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
		pkt->pkt_flow_token = m->m_pkthdr.pkt_flowid;
		pkt->pkt_comp_gencnt = m->m_pkthdr.comp_gencnt;
		switch (m->m_pkthdr.pkt_proto) {
		case IPPROTO_QUIC:
			/* QUIC rides on UDP at the IP layer */
			pkt->pkt_flow_ip_proto = IPPROTO_UDP;
			pkt->pkt_transport_protocol = IPPROTO_QUIC;
			break;

		default:
			pkt->pkt_flow_ip_proto = m->m_pkthdr.pkt_proto;
			pkt->pkt_transport_protocol = m->m_pkthdr.pkt_proto;
			break;
		}
		(void) mbuf_get_timestamp(m, &pkt->pkt_timestamp, NULL);
		pkt->pkt_svc_class = m_get_service_class(m);
		pkt->pkt_pflags &= ~PKT_F_COMMON_MASK;
		pkt->pkt_pflags |= (m->m_pkthdr.pkt_flags & PKT_F_COMMON_MASK);
		if ((m->m_pkthdr.pkt_flags & PKTF_START_SEQ) != 0) {
			pkt->pkt_flow_tcp_seq = htonl(m->m_pkthdr.tx_start_seq);
		}
		if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_L4S) != 0) {
			pkt->pkt_pflags |= PKT_F_L4S;
		}
		/* carry over NECP policy identity */
		necp_get_app_uuid_from_packet(m, pkt->pkt_policy_euuid);
		pkt->pkt_policy_id =
		    (uint32_t)necp_get_policy_id_from_packet(m);
		pkt->pkt_skip_policy_id =
		    (uint32_t)necp_get_skip_policy_id_from_packet(m);

		if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) != 0) {
			if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) != 0) {
				__packet_set_tx_completion_data(ph,
				    m->m_pkthdr.drv_tx_compl_arg,
				    m->m_pkthdr.drv_tx_compl_data);
			}
			pkt->pkt_tx_compl_context =
			    m->m_pkthdr.pkt_compl_context;
			pkt->pkt_tx_compl_callbacks =
			    m->m_pkthdr.pkt_compl_callbacks;
			/*
			 * Remove PKTF_TX_COMPL_TS_REQ flag so that this
			 * mbuf can no longer trigger a completion callback.
			 * callback will be invoked when the kernel packet is
			 * completed.
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;

			m_add_crumb(m, PKT_CRUMB_SK_PKT_COPY);
		}

		/* propagate AQM tx timestamp from the mbuf tag, if present */
		ts_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM);
		if (ts_tag != NULL) {
			__packet_set_tx_timestamp(ph, *(uint64_t *)(ts_tag->m_tag_data));
		}

		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
		    sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), len,
		    (copysum ? (len - start) : 0), csum, start);
		SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
		    " mbuf 0x%llx csumf/txstart/txstuff 0x%x/%u/%u",
		    SK_KVA(m), m->m_pkthdr.csum_flags,
		    (uint32_t)m->m_pkthdr.csum_tx_start,
		    (uint32_t)m->m_pkthdr.csum_tx_stuff);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* account for the copied bytes in the packet metadata */
	METADATA_ADJUST_LEN(pkt, len, poff);

	/* carry over link-layer cast type */
	if (m->m_flags & M_BCAST) {
		__packet_set_link_broadcast(ph);
	} else if (m->m_flags & M_MCAST) {
		__packet_set_link_multicast(ph);
	}

	SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
	    sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
	    (t == NR_RX) ? "RX" : "TX",
	    sk_dump("buf", baddr, len, 128, NULL, 0));
}
1009
1010 /*
1011 * Like m_copydata_sum(), but works on a destination kernel packet.
1012 */
/*
 * Copy "len" bytes from the mbuf chain "m" starting at offset "soff" into
 * the buflet chain of kernel packet "dph" starting at offset "doff".  When
 * "do_cscum" is set, a 16-bit one's complement partial sum is accumulated
 * over the copied bytes and returned (reduced to 16 bits, but not finally
 * complemented); otherwise 0 is returned.  Panics on a short mbuf chain or
 * if the packet's buflets cannot hold the data.
 */
static inline uint32_t
m_copypkt_sum(mbuf_t m, int soff, kern_packet_t dph, uint16_t doff,
    uint32_t len, boolean_t do_cscum)
{
	boolean_t needs_swap, started_on_odd = FALSE;
	int off0 = soff;        /* original offset, kept for panic messages */
	uint32_t len0 = len;    /* original length, kept for panic messages */
	struct mbuf *m0 = m;    /* head of chain, kept for panic messages */
	uint32_t sum = 0, partial;
	unsigned count0, count, odd, mlen_copied;
	uint8_t *sbaddr = NULL, *dbaddr = NULL; /* src/dst copy cursors */
	uint16_t dbcnt = __packet_get_buflet_count(dph);
	uint32_t dlim, dlen0;
	struct __kern_packet *dpkt = SK_PTR_ADDR_KPKT(dph);
	kern_buflet_t dbuf = NULL, dbufp = NULL;

	/* advance to the mbuf in the chain that contains offset soff */
	while (soff > 0) {
		if (__improbable(m == NULL)) {
			panic("%s: invalid mbuf chain %p [off %d, len %d]",
			    __func__, m0, off0, len0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		if (soff < m->m_len) {
			break;
		}
		soff -= m->m_len;
		m = m->m_next;
	}

	if (__improbable(m == NULL)) {
		panic("%s: invalid mbuf chain %p [off %d, len %d]",
		    __func__, m0, off0, len0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	sbaddr = mtod(m, uint8_t *) + soff;
	count = m->m_len - soff;        /* bytes left in current source mbuf */
	mlen_copied = 0;

	while (len != 0) {
		/*
		 * Loop invariant: at least one of the two cursors was
		 * exhausted (NULLed) at the bottom of the previous pass,
		 * so at most one of them is valid here.
		 */
		ASSERT(sbaddr == NULL || dbaddr == NULL);
		/* refill the source cursor from the next mbuf */
		if (sbaddr == NULL) {
			soff = 0;
			m = m->m_next;
			if (__improbable(m == NULL)) {
				panic("%s: invalid mbuf chain %p [off %d, "
				    "len %d]", __func__, m0, off0, len0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			sbaddr = mtod(m, uint8_t *);
			count = m->m_len;
			mlen_copied = 0;
		}

		if (__improbable(count == 0)) {
			/* zero-length mbuf; skip to the next one */
			sbaddr = NULL;
			continue;
		}

		/* refill the destination cursor from the next buflet */
		if (dbaddr == NULL) {
			if (dbufp != NULL) {
				/* finalize the buflet we just filled */
				__buflet_set_data_length(dbufp, dlen0);
			}

			PKT_GET_NEXT_BUFLET(dpkt, dbcnt, dbufp, dbuf);
			if (__improbable(dbuf == NULL)) {
				panic("%s: mbuf too large %p [off %d, "
				    "len %d]", __func__, m0, off0, len0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			dbufp = dbuf;
			dlim = __buflet_get_data_limit(dbuf) - doff;
			dbaddr = (uint8_t *)__buflet_get_data_address(dbuf) + doff;
			dlen0 = dlim;
			/* doff applies to the first buflet only */
			doff = 0;
		}

		/* copy size: bounded by source mbuf, total, and buflet room */
		count = MIN(count, (unsigned)len);
		count0 = count = MIN(count, dlim);

		if (!do_cscum) {
			_pkt_copy(sbaddr, dbaddr, count);
			sbaddr += count;
			dbaddr += count;
			goto skip_csum;
		}

		partial = 0;
		if ((uintptr_t)sbaddr & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *sbaddr << 8;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial = *sbaddr;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			*dbaddr++ = *sbaddr++;
			count -= 1;
		}

		needs_swap = started_on_odd;
		odd = count & 1u;
		count -= odd;

		if (count) {
			partial = __packet_copy_and_sum(sbaddr,
			    dbaddr, count, partial);
			sbaddr += count;
			dbaddr += count;
			/*
			 * Reduce the accumulator before its top bits fill
			 * up, so the 8-bit rotate below cannot overflow.
			 */
			if (__improbable(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}

		if (odd) {
			/* fold in the trailing odd byte */
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *sbaddr;
#else /* BYTE_ORDER != LITTLE_ENDIAN */
			partial += *sbaddr << 8;
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
			*dbaddr++ = *sbaddr++;
			started_on_odd = !started_on_odd;
		}

		if (needs_swap) {
			/* 32-bit rotate-by-8 compensates odd alignment */
			partial = (partial << 8) + (partial >> 24);
		}
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);

skip_csum:
		dlim -= count0;
		len -= count0;
		mlen_copied += count0;

		if (dlim == 0) {
			/* destination buflet exhausted */
			dbaddr = NULL;
		}

		count = m->m_len - soff - mlen_copied;
		if (count == 0) {
			/* source mbuf exhausted */
			sbaddr = NULL;
		}
	}

	ASSERT(len == 0);
	ASSERT(dbuf != NULL);
	/* record the data length of the final (partially filled) buflet */
	__buflet_set_data_length(dbuf, (dlen0 - dlim));

	if (!do_cscum) {
		return 0;
	}

	/* Final fold (reduce 32-bit to 16-bit) */
	sum = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return sum;
}
1185
1186 /*
1187 * This is a multi-buflet variant of pkt_copy_from_mbuf().
1188 *
1189 * start/stuff is relative to moff, within [0, len], such that
1190 * [ 0 ... moff ... moff + start/stuff ... moff + len ... ]
1191 */
1192 void
pkt_copy_multi_buflet_from_mbuf(const enum txrx t,kern_packet_t ph,const uint16_t poff,struct mbuf * m,const uint16_t moff,const uint32_t len,const boolean_t copysum,const uint16_t start)1193 pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph,
1194 const uint16_t poff, struct mbuf *m, const uint16_t moff,
1195 const uint32_t len, const boolean_t copysum, const uint16_t start)
1196 {
1197 struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1198 struct m_tag *ts_tag = NULL;
1199 uint32_t partial;
1200 uint16_t csum = 0;
1201 uint8_t *baddr;
1202
1203 _CASSERT(sizeof(csum) == sizeof(uint16_t));
1204
1205 /* get buffer address from packet */
1206 MD_BUFLET_ADDR_ABS(pkt, baddr);
1207 ASSERT(baddr != NULL);
1208 baddr += poff;
1209 VERIFY((poff + len) <= (PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp) *
1210 __packet_get_buflet_count(ph)));
1211
1212 switch (t) {
1213 case NR_RX:
1214 pkt->pkt_csum_flags = m->m_pkthdr.csum_flags;
1215 pkt->pkt_csum_rx_start_off = 0;
1216 pkt->pkt_csum_rx_value = m->m_pkthdr.csum_rx_val;
1217 pkt->pkt_svc_class = m_get_service_class(m);
1218 if (__probable(((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS)
1219 != CSUM_RX_FULL_FLAGS) && copysum)) {
1220 /*
1221 * Use m_copydata() to copy the portion up to the
1222 * point where we need to start the checksum, and
1223 * copy the remainder, checksumming as we go.
1224 */
1225 if (start != 0) {
1226 m_copydata(m, moff, start, baddr);
1227 }
1228 partial = m_copypkt_sum(m, start, ph, (poff + start),
1229 (len - start), TRUE);
1230 csum = __packet_fold_sum(partial);
1231 __packet_set_inet_checksum(ph, PACKET_CSUM_PARTIAL,
1232 start, csum, FALSE);
1233 METADATA_ADJUST_LEN(pkt, start, poff);
1234 } else {
1235 (void) m_copypkt_sum(m, moff, ph, poff, len, FALSE);
1236 }
1237 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1238 "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
1239 sk_proc_name_address(current_proc()),
1240 sk_proc_pid(current_proc()), len,
1241 (copysum ? (len - start) : 0), csum, start);
1242 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1243 " mbuf 0x%llx csumf/rxstart/rxval 0x%x/%u/0x%04x",
1244 SK_KVA(m), m->m_pkthdr.csum_flags,
1245 (uint32_t)m->m_pkthdr.csum_rx_start,
1246 (uint32_t)m->m_pkthdr.csum_rx_val);
1247 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1248 " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
1249 SK_KVA(pkt), poff, pkt->pkt_csum_flags,
1250 (uint32_t)pkt->pkt_csum_rx_start_off,
1251 (uint32_t)pkt->pkt_csum_rx_value);
1252 break;
1253
1254 case NR_TX:
1255 if (copysum) {
1256 uint16_t stuff = m->m_pkthdr.csum_tx_stuff;
1257 /*
1258 * Use m_copydata() to copy the portion up to the
1259 * point where we need to start the checksum, and
1260 * copy the remainder, checksumming as we go.
1261 */
1262 if (start != 0) {
1263 m_copydata(m, moff, start, baddr);
1264 }
1265 partial = m_copypkt_sum(m, start, ph, (poff + start),
1266 (len - start), TRUE);
1267 csum = __packet_fold_sum_final(partial);
1268
1269 /*
1270 * RFC1122 4.1.3.4: Invert 0 to -0 for UDP;
1271 * ideally we'd only test for CSUM_ZERO_INVERT
1272 * here, but catch cases where the originator
1273 * did not set it for UDP.
1274 */
1275 if (csum == 0 && (m->m_pkthdr.csum_flags &
1276 (CSUM_UDP | CSUM_UDPIPV6 | CSUM_ZERO_INVERT))) {
1277 csum = 0xffff;
1278 }
1279
1280 /* Insert checksum into packet */
1281 ASSERT(stuff <= (len - sizeof(csum)));
1282 if (IS_P2ALIGNED(baddr + stuff, sizeof(csum))) {
1283 *(uint16_t *)(uintptr_t)(baddr + stuff) = csum;
1284 } else {
1285 bcopy((void *)&csum, baddr + stuff,
1286 sizeof(csum));
1287 }
1288 METADATA_ADJUST_LEN(pkt, start, poff);
1289 } else {
1290 m_copypkt_sum(m, moff, ph, poff, len, FALSE);
1291 }
1292 pkt->pkt_csum_flags = 0;
1293 pkt->pkt_csum_tx_start_off = 0;
1294 pkt->pkt_csum_tx_stuff_off = 0;
1295
1296 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1297 pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV4;
1298 pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
1299 ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV6) == 0);
1300 }
1301 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) {
1302 pkt->pkt_csum_flags |= PACKET_CSUM_TSO_IPV6;
1303 pkt->pkt_proto_seg_sz = (uint16_t)m->m_pkthdr.tso_segsz;
1304 ASSERT((pkt->pkt_csum_flags & PACKET_TSO_IPV4) == 0);
1305 }
1306 if (!copysum) {
1307 pkt->pkt_csum_flags |= _convert_mbuf_csum_flags(m->m_pkthdr.csum_flags);
1308 }
1309
1310 /* translate mbuf metadata */
1311 pkt->pkt_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
1312 pkt->pkt_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
1313 pkt->pkt_flow_token = m->m_pkthdr.pkt_flowid;
1314 pkt->pkt_comp_gencnt = m->m_pkthdr.comp_gencnt;
1315 switch (m->m_pkthdr.pkt_proto) {
1316 case IPPROTO_QUIC:
1317 pkt->pkt_flow_ip_proto = IPPROTO_UDP;
1318 pkt->pkt_transport_protocol = IPPROTO_QUIC;
1319 break;
1320
1321 default:
1322 pkt->pkt_flow_ip_proto = m->m_pkthdr.pkt_proto;
1323 pkt->pkt_transport_protocol = m->m_pkthdr.pkt_proto;
1324 break;
1325 }
1326 (void) mbuf_get_timestamp(m, &pkt->pkt_timestamp, NULL);
1327 pkt->pkt_svc_class = m_get_service_class(m);
1328 pkt->pkt_pflags &= ~PKT_F_COMMON_MASK;
1329 pkt->pkt_pflags |= (m->m_pkthdr.pkt_flags & PKT_F_COMMON_MASK);
1330 if ((m->m_pkthdr.pkt_flags & PKTF_START_SEQ) != 0) {
1331 pkt->pkt_flow_tcp_seq = htonl(m->m_pkthdr.tx_start_seq);
1332 }
1333 if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_L4S) != 0) {
1334 pkt->pkt_pflags |= PKT_F_L4S;
1335 }
1336 necp_get_app_uuid_from_packet(m, pkt->pkt_policy_euuid);
1337 pkt->pkt_policy_id =
1338 (uint32_t)necp_get_policy_id_from_packet(m);
1339 pkt->pkt_skip_policy_id =
1340 (uint32_t)necp_get_skip_policy_id_from_packet(m);
1341
1342 if ((m->m_pkthdr.pkt_flags & PKTF_TX_COMPL_TS_REQ) != 0) {
1343 if ((m->m_pkthdr.pkt_flags & PKTF_DRIVER_MTAG) != 0) {
1344 __packet_set_tx_completion_data(ph,
1345 m->m_pkthdr.drv_tx_compl_arg,
1346 m->m_pkthdr.drv_tx_compl_data);
1347 }
1348 pkt->pkt_tx_compl_context =
1349 m->m_pkthdr.pkt_compl_context;
1350 pkt->pkt_tx_compl_callbacks =
1351 m->m_pkthdr.pkt_compl_callbacks;
1352 /*
1353 * Remove PKTF_TX_COMPL_TS_REQ flag so that this
1354 * mbuf can no longer trigger a completion callback.
1355 * callback will be invoked when the kernel packet is
1356 * completed.
1357 */
1358 m->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
1359
1360 m_add_crumb(m, PKT_CRUMB_SK_PKT_COPY);
1361 }
1362
1363 ts_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM);
1364 if (ts_tag != NULL) {
1365 __packet_set_tx_timestamp(ph, *(uint64_t *)(ts_tag->m_tag_data));
1366 }
1367
1368 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
1369 "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
1370 sk_proc_name_address(current_proc()),
1371 sk_proc_pid(current_proc()), len,
1372 (copysum ? (len - start) : 0), csum, start);
1373 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
1374 " mbuf 0x%llx csumf/txstart/txstuff 0x%x/%u/%u",
1375 SK_KVA(m), m->m_pkthdr.csum_flags,
1376 (uint32_t)m->m_pkthdr.csum_tx_start,
1377 (uint32_t)m->m_pkthdr.csum_tx_stuff);
1378 break;
1379
1380 default:
1381 VERIFY(0);
1382 /* NOTREACHED */
1383 __builtin_unreachable();
1384 }
1385
1386 if (m->m_flags & M_BCAST) {
1387 __packet_set_link_broadcast(ph);
1388 } else if (m->m_flags & M_MCAST) {
1389 __packet_set_link_multicast(ph);
1390 }
1391
1392 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
1393 sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
1394 (t == NR_RX) ? "RX" : "TX",
1395 sk_dump("buf", baddr, len, 128, NULL, 0));
1396 }
1397
1398 static inline uint32_t
_convert_pkt_csum_flags(uint32_t pkt_flags)1399 _convert_pkt_csum_flags(uint32_t pkt_flags)
1400 {
1401 uint32_t mbuf_flags = 0;
1402 if (pkt_flags & PACKET_CSUM_TCP) {
1403 mbuf_flags |= CSUM_TCP;
1404 }
1405 if (pkt_flags & PACKET_CSUM_TCPIPV6) {
1406 mbuf_flags |= CSUM_TCPIPV6;
1407 }
1408 if (pkt_flags & PACKET_CSUM_UDP) {
1409 mbuf_flags |= CSUM_UDP;
1410 }
1411 if (pkt_flags & PACKET_CSUM_UDPIPV6) {
1412 mbuf_flags |= CSUM_UDPIPV6;
1413 }
1414 if (pkt_flags & PACKET_CSUM_IP) {
1415 mbuf_flags |= CSUM_IP;
1416 }
1417 if (pkt_flags & PACKET_CSUM_ZERO_INVERT) {
1418 mbuf_flags |= CSUM_ZERO_INVERT;
1419 }
1420 if (pkt_flags & PACKET_CSUM_TSO_IPV4) {
1421 mbuf_flags |= CSUM_TSO_IPV4;
1422 }
1423 if (pkt_flags & PACKET_CSUM_TSO_IPV6) {
1424 mbuf_flags |= CSUM_TSO_IPV6;
1425 }
1426
1427 return mbuf_flags;
1428 }
1429
1430 /*
1431 * This routine is used for copying from a packet originating from a native
1432 * skywalk interface to an mbuf destined for the host legacy stack (NR_RX),
1433 * as well as for mbufs destined for the compat network interfaces (NR_TX).
1434 *
1435 * We do adjust the length to reflect the total data span.
1436 *
1437 * This routine supports copying into an mbuf chain for RX but not TX.
1438 *
1439 * start/stuff is relative to poff, within [0, len], such that
1440 * [ 0 ... poff ... poff + start/stuff ... poff + len ... ]
1441 */
1442 void
pkt_copy_to_mbuf(const enum txrx t,kern_packet_t ph,const uint16_t poff,struct mbuf * m,const uint16_t moff,const uint32_t len,const boolean_t copysum,const uint16_t start)1443 pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff,
1444 struct mbuf *m, const uint16_t moff, const uint32_t len,
1445 const boolean_t copysum, const uint16_t start)
1446 {
1447 struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1448 struct mbuf *curr_m;
1449 uint32_t partial = 0;
1450 uint32_t remaining_len = len, copied_len = 0;
1451 uint16_t csum = 0;
1452 uint8_t *baddr;
1453 uint8_t *dp;
1454 boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt);
1455
1456 ASSERT(len >= start);
1457 _CASSERT(sizeof(csum) == sizeof(uint16_t));
1458
1459 /* get buffer address from packet */
1460 MD_BUFLET_ADDR_ABS(pkt, baddr);
1461 ASSERT(baddr != NULL);
1462 baddr += poff;
1463 VERIFY((poff + len) <= PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp));
1464
1465 ASSERT((m->m_flags & M_PKTHDR));
1466 m->m_data += moff;
1467
1468 switch (t) {
1469 case NR_RX:
1470 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
1471
1472 /*
1473 * Use pkt_copy() to copy the portion up to the
1474 * point where we need to start the checksum, and
1475 * copy the remainder, checksumming as we go.
1476 */
1477 if (__probable(do_sum && start != 0)) {
1478 ASSERT(M_TRAILINGSPACE(m) >= start);
1479 ASSERT(m->m_len == 0);
1480 dp = (uint8_t *)m_mtod_current(m);
1481 _pkt_copy(baddr, dp, start);
1482 remaining_len -= start;
1483 copied_len += start;
1484 m->m_len += start;
1485 m->m_pkthdr.len += start;
1486 }
1487 curr_m = m;
1488 while (curr_m != NULL && remaining_len != 0) {
1489 uint32_t tmp_len = MIN(remaining_len,
1490 (uint32_t)M_TRAILINGSPACE(curr_m));
1491 dp = (uint8_t *)m_mtod_end(curr_m);
1492 if (__probable(do_sum)) {
1493 partial = __packet_copy_and_sum((baddr + copied_len),
1494 dp, tmp_len, partial);
1495 } else {
1496 _pkt_copy((baddr + copied_len), dp, tmp_len);
1497 }
1498
1499 curr_m->m_len += tmp_len;
1500 m->m_pkthdr.len += tmp_len;
1501 copied_len += tmp_len;
1502 remaining_len -= tmp_len;
1503 curr_m = curr_m->m_next;
1504 }
1505 ASSERT(remaining_len == 0);
1506
1507 if (__probable(do_sum)) {
1508 csum = __packet_fold_sum(partial);
1509
1510 m->m_pkthdr.csum_flags |=
1511 (CSUM_DATA_VALID | CSUM_PARTIAL);
1512 m->m_pkthdr.csum_rx_start = start;
1513 m->m_pkthdr.csum_rx_val = csum;
1514 } else {
1515 m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
1516 m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
1517 _CASSERT(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS);
1518 m->m_pkthdr.csum_flags |= pkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS;
1519 if (__improbable((pkt->pkt_csum_flags & PACKET_CSUM_PARTIAL) != 0)) {
1520 m->m_pkthdr.csum_flags |= CSUM_PARTIAL;
1521 }
1522 }
1523
1524 /* translate packet metadata */
1525 mbuf_set_timestamp(m, pkt->pkt_timestamp,
1526 ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));
1527
1528 m->m_pkthdr.rx_seg_cnt = pkt->pkt_seg_cnt;
1529
1530 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1531 "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
1532 sk_proc_name_address(current_proc()),
1533 sk_proc_pid(current_proc()), len,
1534 (copysum ? (len - start) : 0), csum, start);
1535 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1536 " mbuf 0x%llx moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
1537 SK_KVA(m), moff, m->m_pkthdr.csum_flags,
1538 (uint32_t)m->m_pkthdr.csum_rx_start,
1539 (uint32_t)m->m_pkthdr.csum_rx_val);
1540 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1541 " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
1542 SK_KVA(pkt), poff, pkt->pkt_csum_flags,
1543 (uint32_t)pkt->pkt_csum_rx_start_off,
1544 (uint32_t)pkt->pkt_csum_rx_value);
1545 break;
1546
1547 case NR_TX:
1548 dp = (uint8_t *)m_mtod_current(m);
1549 ASSERT(m->m_next == NULL);
1550
1551 VERIFY(((intptr_t)dp - (intptr_t)mbuf_datastart(m)) + len <=
1552 (uint32_t)mbuf_maxlen(m));
1553 m->m_len += len;
1554 m->m_pkthdr.len += len;
1555 VERIFY(m->m_len == m->m_pkthdr.len &&
1556 (uint32_t)m->m_len <= (uint32_t)mbuf_maxlen(m));
1557
1558 if (copysum) {
1559 uint16_t stuff = pkt->pkt_csum_tx_stuff_off;
1560 /*
1561 * Use pkt_copy() to copy the portion up to the
1562 * point where we need to start the checksum, and
1563 * copy the remainder, checksumming as we go.
1564 */
1565 if (__probable(start != 0)) {
1566 _pkt_copy(baddr, dp, start);
1567 }
1568 partial = __packet_copy_and_sum((baddr + start),
1569 (dp + start), (len - start), 0);
1570 csum = __packet_fold_sum_final(partial);
1571
1572 /* RFC1122 4.1.3.4: Invert 0 to -0 (for UDP) */
1573 if (csum == 0 &&
1574 (pkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)) {
1575 csum = 0xffff;
1576 }
1577
1578 /* Insert checksum into packet */
1579 ASSERT(stuff <= (len - sizeof(csum)));
1580 if (IS_P2ALIGNED(dp + stuff, sizeof(csum))) {
1581 *(uint16_t *)(uintptr_t)(dp + stuff) = csum;
1582 } else {
1583 bcopy((void *)&csum, dp + stuff, sizeof(csum));
1584 }
1585 } else {
1586 _pkt_copy(baddr, dp, len);
1587 }
1588 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
1589 m->m_pkthdr.csum_tx_start = 0;
1590 m->m_pkthdr.csum_tx_stuff = 0;
1591 m->m_pkthdr.csum_flags |= _convert_pkt_csum_flags(pkt->pkt_csum_flags);
1592
1593 /* translate packet metadata */
1594 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
1595 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
1596 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
1597 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
1598 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
1599 m->m_pkthdr.tso_segsz = pkt->pkt_proto_seg_sz;
1600 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
1601 mbuf_set_timestamp(m, pkt->pkt_timestamp,
1602 ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));
1603 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
1604 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
1605 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
1606 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
1607 }
1608 if ((pkt->pkt_pflags & PKT_F_L4S) != 0) {
1609 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
1610 }
1611 if (__improbable(copy_pkt_tx_time != 0 &&
1612 (pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) {
1613 struct m_tag *tag = NULL;
1614 tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
1615 sizeof(uint64_t), M_WAITOK, m);
1616 if (tag != NULL) {
1617 m_tag_prepend(m, tag);
1618 *(uint64_t *)tag->m_tag_data = pkt->pkt_com_opt->__po_pkt_tx_time;
1619 }
1620 }
1621 m->m_pkthdr.necp_mtag.necp_policy_id = pkt->pkt_policy_id;
1622 m->m_pkthdr.necp_mtag.necp_skip_policy_id = pkt->pkt_skip_policy_id;
1623
1624 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
1625 "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
1626 sk_proc_name_address(current_proc()),
1627 sk_proc_pid(current_proc()), len,
1628 (copysum ? (len - start) : 0), csum, start);
1629 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
1630 " pkt 0x%llx poff %u csumf/txstart/txstuff 0x%x/%u/%u",
1631 SK_KVA(pkt), poff, pkt->pkt_csum_flags,
1632 (uint32_t)pkt->pkt_csum_tx_start_off,
1633 (uint32_t)pkt->pkt_csum_tx_stuff_off);
1634 break;
1635
1636 default:
1637 VERIFY(0);
1638 /* NOTREACHED */
1639 __builtin_unreachable();
1640 }
1641
1642 if (pkt->pkt_link_flags & PKT_LINKF_BCAST) {
1643 m->m_flags |= M_BCAST;
1644 } else if (pkt->pkt_link_flags & PKT_LINKF_MCAST) {
1645 m->m_flags |= M_MCAST;
1646 }
1647 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
1648 sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
1649 (t == NR_RX) ? "RX" : "TX",
1650 sk_dump("buf", (uint8_t *)dp, m->m_len, 128, NULL, 0));
1651 }
1652
1653 /*
1654 * This is a multi-buflet variant of pkt_copy_to_mbuf().
1655 * NOTE: poff is the offset within the packet.
1656 *
1657 * This routine supports copying into an mbuf chain for RX but not TX.
1658 *
1659 * start/stuff is relative to poff, within [0, len], such that
1660 * [ 0 ... poff ... poff + start/stuff ... poff + len ... ]
1661 */
1662 void
pkt_copy_multi_buflet_to_mbuf(const enum txrx t,kern_packet_t ph,const uint16_t poff,struct mbuf * m,const uint16_t moff,const uint32_t len,const boolean_t copysum,const uint16_t start)1663 pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph,
1664 const uint16_t poff, struct mbuf *m, const uint16_t moff,
1665 const uint32_t len, const boolean_t copysum, const uint16_t start)
1666 {
1667 struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1668 struct mbuf *curr_m;
1669 uint32_t partial = 0;
1670 uint32_t remaining_len = len, copied_len = 0;
1671 uint16_t csum = 0;
1672 uint8_t *baddr;
1673 uint8_t *dp;
1674 boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt);
1675
1676 ASSERT(len >= start);
1677 _CASSERT(sizeof(csum) == sizeof(uint16_t));
1678
1679 /* get buffer address from packet */
1680 MD_BUFLET_ADDR_ABS(pkt, baddr);
1681 ASSERT(baddr != NULL);
1682 baddr += poff;
1683
1684 ASSERT((m->m_flags & M_PKTHDR));
1685 m->m_data += moff;
1686
1687 switch (t) {
1688 case NR_RX:
1689 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
1690 if (__probable(do_sum && start != 0)) {
1691 ASSERT(M_TRAILINGSPACE(m) >= start);
1692 ASSERT(m->m_len == 0);
1693 dp = (uint8_t *)m_mtod_current(m);
1694 _pkt_copy(baddr, dp, start);
1695 remaining_len -= start;
1696 copied_len += start;
1697 m->m_len += start;
1698 m->m_pkthdr.len += start;
1699 }
1700 curr_m = m;
1701 while (curr_m != NULL && remaining_len != 0) {
1702 uint32_t tmp_len = MIN(remaining_len,
1703 (uint32_t)M_TRAILINGSPACE(curr_m));
1704 uint16_t soff = poff + (uint16_t)copied_len;
1705 dp = (uint8_t *)m_mtod_end(curr_m);
1706
1707 if (__probable(do_sum)) {
1708 partial = _pkt_copyaddr_sum(ph, soff,
1709 dp, tmp_len, TRUE, partial, NULL);
1710 } else {
1711 pkt_copyaddr_sum(ph, soff,
1712 dp, tmp_len, FALSE, 0, NULL);
1713 }
1714
1715 curr_m->m_len += tmp_len;
1716 m->m_pkthdr.len += tmp_len;
1717 copied_len += tmp_len;
1718 remaining_len -= tmp_len;
1719 curr_m = curr_m->m_next;
1720 }
1721 ASSERT(remaining_len == 0);
1722
1723 if (__probable(do_sum)) {
1724 csum = __packet_fold_sum(partial);
1725
1726 m->m_pkthdr.csum_flags |=
1727 (CSUM_DATA_VALID | CSUM_PARTIAL);
1728 m->m_pkthdr.csum_rx_start = start;
1729 m->m_pkthdr.csum_rx_val = csum;
1730 } else {
1731 m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
1732 m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
1733 _CASSERT(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS);
1734 m->m_pkthdr.csum_flags |= pkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS;
1735 if (__improbable((pkt->pkt_csum_flags & PACKET_CSUM_PARTIAL) != 0)) {
1736 m->m_pkthdr.csum_flags |= CSUM_PARTIAL;
1737 }
1738 }
1739
1740 m->m_pkthdr.necp_mtag.necp_policy_id = pkt->pkt_policy_id;
1741 m->m_pkthdr.necp_mtag.necp_skip_policy_id = pkt->pkt_skip_policy_id;
1742
1743 /* translate packet metadata */
1744 mbuf_set_timestamp(m, pkt->pkt_timestamp,
1745 ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));
1746
1747 m->m_pkthdr.rx_seg_cnt = pkt->pkt_seg_cnt;
1748
1749 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1750 "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u",
1751 sk_proc_name_address(current_proc()),
1752 sk_proc_pid(current_proc()), len,
1753 (copysum ? (len - start) : 0), csum, start);
1754 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1755 " mbuf 0x%llx moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
1756 SK_KVA(m), moff, m->m_pkthdr.csum_flags,
1757 (uint32_t)m->m_pkthdr.csum_rx_start,
1758 (uint32_t)m->m_pkthdr.csum_rx_val);
1759 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX,
1760 " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x",
1761 SK_KVA(pkt), poff, pkt->pkt_csum_flags,
1762 (uint32_t)pkt->pkt_csum_rx_start_off,
1763 (uint32_t)pkt->pkt_csum_rx_value);
1764 break;
1765 case NR_TX:
1766 ASSERT(len <= M16KCLBYTES);
1767 dp = (uint8_t *)m_mtod_current(m);
1768 ASSERT(m->m_next == NULL);
1769 VERIFY(((intptr_t)dp - (intptr_t)mbuf_datastart(m)) + len <=
1770 (uint32_t)mbuf_maxlen(m));
1771 m->m_len += len;
1772 m->m_pkthdr.len += len;
1773 VERIFY(m->m_len == m->m_pkthdr.len &&
1774 (uint32_t)m->m_len <= (uint32_t)mbuf_maxlen(m));
1775 if (copysum) {
1776 uint16_t stuff = pkt->pkt_csum_tx_stuff_off;
1777 /*
1778 * Use pkt_copy() to copy the portion up to the
1779 * point where we need to start the checksum, and
1780 * copy the remainder, checksumming as we go.
1781 */
1782 if (__probable(start != 0)) {
1783 _pkt_copy(baddr, dp, start);
1784 }
1785 partial = _pkt_copyaddr_sum(ph, (poff + start),
1786 (dp + start), (len - start), TRUE, 0, NULL);
1787 csum = __packet_fold_sum_final(partial);
1788
1789 /* RFC1122 4.1.3.4: Invert 0 to -0 (for UDP) */
1790 if (csum == 0 &&
1791 (pkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)) {
1792 csum = 0xffff;
1793 }
1794
1795 /* Insert checksum into packet */
1796 ASSERT(stuff <= (len - sizeof(csum)));
1797 if (IS_P2ALIGNED(dp + stuff, sizeof(csum))) {
1798 *(uint16_t *)(uintptr_t)(dp + stuff) = csum;
1799 } else {
1800 bcopy((void *)&csum, dp + stuff, sizeof(csum));
1801 }
1802 } else {
1803 (void) _pkt_copyaddr_sum(ph, poff, dp, len, FALSE, 0, NULL);
1804 }
1805 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
1806 m->m_pkthdr.csum_tx_start = 0;
1807 m->m_pkthdr.csum_tx_stuff = 0;
1808 m->m_pkthdr.csum_flags |= _convert_pkt_csum_flags(pkt->pkt_csum_flags);
1809
1810 /* translate packet metadata */
1811 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
1812 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
1813 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
1814 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
1815 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
1816 m->m_pkthdr.tso_segsz = pkt->pkt_proto_seg_sz;
1817 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
1818 mbuf_set_timestamp(m, pkt->pkt_timestamp,
1819 ((pkt->pkt_pflags & PKT_F_TS_VALID) != 0));
1820 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
1821 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
1822 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
1823 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
1824 }
1825 if ((pkt->pkt_pflags & PKT_F_L4S) != 0) {
1826 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
1827 }
1828 if (__improbable(copy_pkt_tx_time != 0 &&
1829 (pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) {
1830 struct m_tag *tag = NULL;
1831 tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
1832 sizeof(uint64_t), M_WAITOK, m);
1833 if (tag != NULL) {
1834 m_tag_prepend(m, tag);
1835 *(uint64_t *)tag->m_tag_data = pkt->pkt_com_opt->__po_pkt_tx_time;
1836 }
1837 }
1838
1839 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
1840 "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u",
1841 sk_proc_name_address(current_proc()),
1842 sk_proc_pid(current_proc()), len,
1843 (copysum ? (len - start) : 0), csum, start);
1844 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX,
1845 " pkt 0x%llx poff %u csumf/txstart/txstuff 0x%x/%u/%u",
1846 SK_KVA(pkt), poff, pkt->pkt_csum_flags,
1847 (uint32_t)pkt->pkt_csum_tx_start_off,
1848 (uint32_t)pkt->pkt_csum_tx_stuff_off);
1849 break;
1850
1851 default:
1852 VERIFY(0);
1853 /* NOTREACHED */
1854 __builtin_unreachable();
1855 }
1856
1857 if (pkt->pkt_link_flags & PKT_LINKF_BCAST) {
1858 m->m_flags |= M_BCAST;
1859 } else if (pkt->pkt_link_flags & PKT_LINKF_MCAST) {
1860 m->m_flags |= M_MCAST;
1861 }
1862 SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s",
1863 sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()),
1864 (t == NR_RX) ? "RX" : "TX",
1865 sk_dump("buf", (uint8_t *)dp, m->m_len, 128, NULL, 0));
1866 }
1867
1868 /*
1869 * Like m_copydata(), but computes 16-bit sum as the data is copied.
1870 * Caller can provide an initial sum to be folded into the computed
1871 * sum. The accumulated partial sum (32-bit) is returned to caller;
1872 * caller is responsible for further reducing it to 16-bit if needed,
1873 * as well as to perform the final 1's complement on it.
1874 */
1875 uint32_t
m_copydata_sum(struct mbuf * m,int off,int len,void * __sized_by (len)vp,uint32_t initial_sum,boolean_t * odd_start)1876 m_copydata_sum(struct mbuf *m, int off, int len, void *__sized_by(len)vp, uint32_t initial_sum,
1877 boolean_t *odd_start)
1878 {
1879 boolean_t needs_swap, started_on_odd = FALSE;
1880 int off0 = off, len0 = len;
1881 struct mbuf *m0 = m;
1882 uint64_t sum, partial;
1883 unsigned count, odd;
1884 char *cp = vp;
1885
1886 if (__improbable(off < 0 || len < 0)) {
1887 panic("%s: invalid offset %d or len %d", __func__, off, len);
1888 /* NOTREACHED */
1889 __builtin_unreachable();
1890 }
1891
1892 while (off > 0) {
1893 if (__improbable(m == NULL)) {
1894 panic("%s: invalid mbuf chain %p [off %d, len %d]",
1895 __func__, m0, off0, len0);
1896 /* NOTREACHED */
1897 __builtin_unreachable();
1898 }
1899 if (off < m->m_len) {
1900 break;
1901 }
1902 off -= m->m_len;
1903 m = m->m_next;
1904 }
1905
1906 if (odd_start) {
1907 started_on_odd = *odd_start;
1908 }
1909 sum = initial_sum;
1910
1911 for (; len0 > 0; m = m->m_next) {
1912 uint8_t *datap;
1913
1914 if (__improbable(m == NULL)) {
1915 panic("%s: invalid mbuf chain %p [off %d, len %d]",
1916 __func__, m0, off0, len);
1917 /* NOTREACHED */
1918 __builtin_unreachable();
1919 }
1920
1921 datap = mtod(m, uint8_t *) + off;
1922 count = m->m_len;
1923
1924 if (__improbable(count == 0)) {
1925 continue;
1926 }
1927
1928 count = MIN(count - off, (unsigned)len0);
1929 partial = 0;
1930
1931 if ((uintptr_t)datap & 1) {
1932 /* Align on word boundary */
1933 started_on_odd = !started_on_odd;
1934 #if BYTE_ORDER == LITTLE_ENDIAN
1935 partial = *datap << 8;
1936 #else /* BYTE_ORDER != LITTLE_ENDIAN */
1937 partial = *datap;
1938 #endif /* BYTE_ORDER != LITTLE_ENDIAN */
1939 *cp++ = *datap++;
1940 count -= 1;
1941 len0 -= 1;
1942 }
1943
1944 needs_swap = started_on_odd;
1945 odd = count & 1u;
1946 count -= odd;
1947
1948 if (count) {
1949 partial = __packet_copy_and_sum(datap,
1950 cp, count, (uint32_t)partial);
1951 datap += count;
1952 cp += count;
1953 len0 -= count;
1954 if (__improbable((partial & (3ULL << 62)) != 0)) {
1955 if (needs_swap) {
1956 partial = (partial << 8) +
1957 (partial >> 56);
1958 }
1959 sum += (partial >> 32);
1960 sum += (partial & 0xffffffff);
1961 partial = 0;
1962 }
1963 }
1964
1965 if (odd) {
1966 #if BYTE_ORDER == LITTLE_ENDIAN
1967 partial += *datap;
1968 #else /* BYTE_ORDER != LITTLE_ENDIAN */
1969 partial += *datap << 8;
1970 #endif /* BYTE_ORDER != LITTLE_ENDIAN */
1971 *cp++ = *datap++;
1972 len0 -= 1;
1973 started_on_odd = !started_on_odd;
1974 }
1975 off = 0;
1976
1977 if (needs_swap) {
1978 partial = (partial << 8) + (partial >> 24);
1979 }
1980 sum += (partial >> 32) + (partial & 0xffffffff);
1981 /*
1982 * Reduce sum to allow potential byte swap
1983 * in the next iteration without carry.
1984 */
1985 sum = (sum >> 32) + (sum & 0xffffffff);
1986 }
1987
1988 if (odd_start) {
1989 *odd_start = started_on_odd;
1990 }
1991
1992 /* Final fold (reduce 64-bit to 32-bit) */
1993 sum = (sum >> 32) + (sum & 0xffffffff); /* 33-bit */
1994 sum = (sum >> 16) + (sum & 0xffff); /* 17-bit + carry */
1995
1996 /* return 32-bit partial sum to caller */
1997 return (uint32_t)sum;
1998 }
1999
2000 #if DEBUG || DEVELOPMENT
2001 #define TRAILERS_MAX 16 /* max trailing bytes */
2002 #define TRAILERS_REGEN (64 * 1024) /* regeneration threshold */
2003 static uint8_t tb[TRAILERS_MAX]; /* random trailing bytes */
2004 static uint32_t regen = TRAILERS_REGEN; /* regeneration counter */
2005
2006 uint32_t
pkt_add_trailers(kern_packet_t ph,const uint32_t len,const uint16_t start)2007 pkt_add_trailers(kern_packet_t ph, const uint32_t len, const uint16_t start)
2008 {
2009 struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
2010 uint32_t extra;
2011 uint8_t *baddr;
2012
2013 /* get buffer address from packet */
2014 MD_BUFLET_ADDR_ABS(pkt, baddr);
2015 ASSERT(baddr != NULL);
2016 ASSERT(len <= PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp));
2017
2018 extra = MIN((uint32_t)pkt_trailers, (uint32_t)TRAILERS_MAX);
2019 if (extra == 0 || extra > sizeof(tb) ||
2020 (len + extra) > PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp)) {
2021 return 0;
2022 }
2023
2024 /* generate random bytes once per TRAILERS_REGEN packets (approx.) */
2025 if (regen++ == TRAILERS_REGEN) {
2026 read_frandom(&tb[0], sizeof(tb));
2027 regen = 0;
2028 }
2029
2030 bcopy(&tb[0], (baddr + len), extra);
2031
2032 /* recompute partial sum (also to exercise related logic) */
2033 pkt->pkt_csum_flags |= PACKET_CSUM_PARTIAL;
2034 pkt->pkt_csum_rx_value = (uint16_t)__packet_cksum((baddr + start),
2035 ((len + extra) - start), 0);
2036 pkt->pkt_csum_rx_start_off = start;
2037
2038 return extra;
2039 }
2040
2041 uint32_t
pkt_add_trailers_mbuf(struct mbuf * m,const uint16_t start)2042 pkt_add_trailers_mbuf(struct mbuf *m, const uint16_t start)
2043 {
2044 uint32_t extra;
2045
2046 extra = MIN((uint32_t)pkt_trailers, (uint32_t)TRAILERS_MAX);
2047 if (extra == 0 || extra > sizeof(tb)) {
2048 return 0;
2049 }
2050
2051 if (mbuf_copyback(m, m_pktlen(m), extra, &tb[0], M_NOWAIT) != 0) {
2052 return 0;
2053 }
2054
2055 /* generate random bytes once per TRAILERS_REGEN packets (approx.) */
2056 if (regen++ == TRAILERS_REGEN) {
2057 read_frandom(&tb[0], sizeof(tb));
2058 regen = 0;
2059 }
2060
2061 /* recompute partial sum (also to exercise related logic) */
2062 m->m_pkthdr.csum_rx_val = m_sum16(m, start, (m_pktlen(m) - start));
2063 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
2064 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
2065 m->m_pkthdr.csum_rx_start = start;
2066
2067 return extra;
2068 }
2069 #endif /* DEBUG || DEVELOPMENT */
2070
/*
 * Exported wrapper around _pkt_copypkt_sum(): copy len bytes from
 * packet sph at offset soff into packet dph at offset doff, with
 * optional checksum accumulation into *partial when do_csum is set
 * (exact copy/checksum semantics are defined by _pkt_copypkt_sum()).
 * VERIFY panics if the underlying copy reports failure, so this
 * variant never returns an error to the caller.
 */
void
pkt_copypkt_sum(kern_packet_t sph, uint16_t soff, kern_packet_t dph,
    uint16_t doff, uint16_t len, uint32_t *partial, boolean_t do_csum)
{
	VERIFY(_pkt_copypkt_sum(sph, soff, dph, doff, len, partial, do_csum));
}
2077
2078 uint32_t
pkt_copyaddr_sum(kern_packet_t sph,uint16_t soff,uint8_t * __sized_by (len)dbaddr,uint32_t len,boolean_t do_csum,uint32_t initial_sum,boolean_t * odd_start)2079 pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbaddr,
2080 uint32_t len, boolean_t do_csum, uint32_t initial_sum, boolean_t *odd_start)
2081 {
2082 return _pkt_copyaddr_sum(sph, soff, dbaddr, len, do_csum, initial_sum, odd_start);
2083 }
2084
2085 uint32_t
pkt_mcopypkt_sum(mbuf_t m,int soff,kern_packet_t dph,uint16_t doff,uint16_t len,boolean_t do_cscum)2086 pkt_mcopypkt_sum(mbuf_t m, int soff, kern_packet_t dph, uint16_t doff,
2087 uint16_t len, boolean_t do_cscum)
2088 {
2089 return m_copypkt_sum(m, soff, dph, doff, len, do_cscum);
2090 }
2091
2092 void
pkt_copy(void * __sized_by (len)src,void * __sized_by (len)dst,size_t len)2093 pkt_copy(void *__sized_by(len)src, void *__sized_by(len)dst, size_t len)
2094 {
2095 return _pkt_copy(src, dst, len);
2096 }
2097