1 /*
2 * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/cdefs.h>
30
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39
40 #include <dev/random/randomdev.h>
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_dl.h>
44 #include <net/if_types.h>
45 #include <net/net_osdep.h>
46 #include <net/pktsched/pktsched.h>
47 #include <net/pktsched/pktsched_fq_codel.h>
48 #include <net/pktsched/pktsched_netem.h>
49
50 #define _IP_VHL
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53
54 #include <pexpert/pexpert.h>
55
56 #if SKYWALK
57 #include <skywalk/os_skywalk_private.h>
58 #endif /* SKYWALK */
59
60 u_int32_t machclk_freq = 0;
61 u_int64_t machclk_per_sec = 0;
62 u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */
63
64 static void init_machclk(void);
65
66 SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");
67
68 SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
69 &pktsched_verbose, 0, "Packet scheduler verbosity level");
70
71 void
pktsched_init(void)72 pktsched_init(void)
73 {
74 init_machclk();
75 if (machclk_freq == 0) {
76 panic("%s: no CPU clock available!", __func__);
77 /* NOTREACHED */
78 }
79 pktsched_fq_init();
80 }
81
82 static void
init_machclk(void)83 init_machclk(void)
84 {
85 /*
86 * Initialize machclk_freq using the timerbase frequency
87 * value from device specific info.
88 */
89 machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;
90
91 clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
92 &machclk_per_sec);
93 }
94
95 u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)96 pktsched_abs_to_nsecs(u_int64_t abstime)
97 {
98 u_int64_t nsecs;
99
100 absolutetime_to_nanoseconds(abstime, &nsecs);
101 return nsecs;
102 }
103
104 u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)105 pktsched_nsecs_to_abstime(u_int64_t nsecs)
106 {
107 u_int64_t abstime;
108
109 nanoseconds_to_absolutetime(nsecs, &abstime);
110 return abstime;
111 }
112
113 int
pktsched_setup(struct ifclassq * ifq,u_int32_t scheduler,u_int32_t sflags,classq_pkt_type_t ptype)114 pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags,
115 classq_pkt_type_t ptype)
116 {
117 int error = 0;
118 u_int32_t rflags;
119
120 IFCQ_LOCK_ASSERT_HELD(ifq);
121
122 VERIFY(machclk_freq != 0);
123
124 /* Nothing to do unless the scheduler type changes */
125 if (ifq->ifcq_type == scheduler) {
126 return 0;
127 }
128
129 /*
130 * Remember the flags that need to be restored upon success, as
131 * they may be cleared when we tear down existing scheduler.
132 */
133 rflags = (ifq->ifcq_flags & IFCQF_ENABLED);
134
135 if (ifq->ifcq_type != PKTSCHEDT_NONE) {
136 pktsched_teardown(ifq);
137
138 /* Teardown should have succeeded */
139 VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
140 VERIFY(ifq->ifcq_disc == NULL);
141 }
142
143 error = fq_if_setup_ifclassq(ifq, sflags, ptype);
144 if (error == 0) {
145 ifq->ifcq_flags |= rflags;
146 }
147
148 return error;
149 }
150
151 void
pktsched_teardown(struct ifclassq * ifq)152 pktsched_teardown(struct ifclassq *ifq)
153 {
154 IFCQ_LOCK_ASSERT_HELD(ifq);
155 if_qflush(ifq->ifcq_ifp, ifq, true);
156 VERIFY(IFCQ_IS_EMPTY(ifq));
157 ifq->ifcq_flags &= ~IFCQF_ENABLED;
158 if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
159 /* Could be PKTSCHEDT_NONE */
160 fq_if_teardown_ifclassq(ifq);
161 }
162 return;
163 }
164
165 int
pktsched_getqstats(struct ifclassq * ifq,u_int32_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)166 pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
167 struct if_ifclassq_stats *ifqs)
168 {
169 int error = 0;
170
171 IFCQ_LOCK_ASSERT_HELD(ifq);
172
173 if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
174 /* Could be PKTSCHEDT_NONE */
175 error = fq_if_getqstats_ifclassq(ifq, (uint8_t)gid, qid, ifqs);
176 }
177
178 return error;
179 }
180
181 void
pktsched_pkt_encap(pktsched_pkt_t * pkt,classq_pkt_t * cpkt)182 pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
183 {
184 pkt->pktsched_pkt = *cpkt;
185 pkt->pktsched_tail = *cpkt;
186 pkt->pktsched_pcnt = 1;
187
188 switch (cpkt->cp_ptype) {
189 case QP_MBUF:
190 pkt->pktsched_plen =
191 (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
192 break;
193
194 #if SKYWALK
195 case QP_PACKET:
196 pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
197 break;
198 #endif /* SKYWALK */
199
200 default:
201 VERIFY(0);
202 /* NOTREACHED */
203 __builtin_unreachable();
204 }
205 }
206
207 void
pktsched_pkt_encap_chain(pktsched_pkt_t * pkt,classq_pkt_t * cpkt,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes)208 pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
209 classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
210 {
211 pkt->pktsched_pkt = *cpkt;
212 pkt->pktsched_tail = *tail;
213 pkt->pktsched_pcnt = cnt;
214 pkt->pktsched_plen = bytes;
215
216 switch (cpkt->cp_ptype) {
217 case QP_MBUF:
218 break;
219
220 #if SKYWALK
221 case QP_PACKET:
222 break;
223 #endif /* SKYWALK */
224
225 default:
226 VERIFY(0);
227 /* NOTREACHED */
228 __builtin_unreachable();
229 }
230 }
231
232 int
pktsched_clone_pkt(pktsched_pkt_t * pkt1,pktsched_pkt_t * pkt2)233 pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
234 {
235 struct mbuf *m1, *m2;
236 #if SKYWALK
237 struct __kern_packet *p1;
238 kern_packet_t ph2;
239 int err;
240 #endif /* SKYWALK */
241
242 ASSERT(pkt1 != NULL);
243 ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
244 ASSERT(pkt1->pktsched_pcnt == 1);
245
246 /* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
247 ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
248 pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
249 pkt2->pktsched_pkt_mbuf == NULL));
250
251 switch (pkt1->pktsched_ptype) {
252 case QP_MBUF:
253 m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
254 m2 = m_dup(m1, M_NOWAIT);
255 if (__improbable(m2 == NULL)) {
256 return ENOBUFS;
257 }
258 pkt2->pktsched_pkt_mbuf = m2;
259 break;
260
261 #if SKYWALK
262 case QP_PACKET:
263 p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
264 err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
265 METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
266 KPKT_COPY_HEAVY);
267 if (__improbable(err != 0)) {
268 return err;
269 }
270 ASSERT(ph2 != 0);
271 VERIFY(kern_packet_finalize(ph2) == 0);
272 pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
273 break;
274 #endif /* SKYWALK */
275
276 default:
277 VERIFY(0);
278 /* NOTREACHED */
279 __builtin_unreachable();
280 }
281
282 pkt2->pktsched_plen = pkt1->pktsched_plen;
283 pkt2->pktsched_ptype = pkt1->pktsched_ptype;
284 pkt2->pktsched_tail = pkt2->pktsched_pkt;
285 pkt2->pktsched_pcnt = 1;
286 return 0;
287 }
288
289 void
pktsched_corrupt_packet(pktsched_pkt_t * pkt)290 pktsched_corrupt_packet(pktsched_pkt_t *pkt)
291 {
292 struct mbuf *m = NULL;
293 uint8_t *data = NULL;
294 uint32_t data_len = 0;
295 uint32_t rand32, rand_off, rand_bit;
296 #if SKYWALK
297 struct __kern_packet *p = NULL;
298 #endif /* SKYWALK */
299
300 switch (pkt->pktsched_ptype) {
301 case QP_MBUF:
302 m = pkt->pktsched_pkt_mbuf;
303 data = mtod(m, uint8_t *);
304 data_len = m->m_pkthdr.len;
305 break;
306 #if SKYWALK
307 case QP_PACKET:
308 p = pkt->pktsched_pkt_kpkt;
309 if (p->pkt_pflags & PKT_F_MBUF_DATA) {
310 m = p->pkt_mbuf;
311 data = mtod(m, uint8_t *);
312 data_len = m->m_pkthdr.len;
313 } else {
314 MD_BUFLET_ADDR_DLEN(p, data, data_len);
315 }
316 break;
317 #endif /* SKYWALK */
318
319 default:
320 /* NOTREACHED */
321 VERIFY(0);
322 __builtin_unreachable();
323 }
324
325 read_frandom(&rand32, sizeof(rand32));
326 rand_bit = rand32 & 0x8;
327 rand_off = (rand32 >> 3) % data_len;
328 data[rand_off] ^= 1 << rand_bit;
329 }
330
331 void
pktsched_free_pkt(pktsched_pkt_t * pkt)332 pktsched_free_pkt(pktsched_pkt_t *pkt)
333 {
334 uint32_t cnt = pkt->pktsched_pcnt;
335 ASSERT(cnt != 0);
336
337 switch (pkt->pktsched_ptype) {
338 case QP_MBUF: {
339 struct mbuf *m;
340
341 m = pkt->pktsched_pkt_mbuf;
342 if (cnt == 1) {
343 VERIFY(m->m_nextpkt == NULL);
344 } else {
345 VERIFY(m->m_nextpkt != NULL);
346 }
347 m_freem_list(m);
348 break;
349 }
350 #if SKYWALK
351 case QP_PACKET: {
352 struct __kern_packet *kpkt;
353 int pcnt = 0;
354
355 kpkt = pkt->pktsched_pkt_kpkt;
356 if (cnt == 1) {
357 VERIFY(kpkt->pkt_nextpkt == NULL);
358 } else {
359 VERIFY(kpkt->pkt_nextpkt != NULL);
360 }
361 pp_free_packet_chain(kpkt, &pcnt);
362 VERIFY(cnt == (uint32_t)pcnt);
363 break;
364 }
365 #endif /* SKYWALK */
366
367 default:
368 VERIFY(0);
369 /* NOTREACHED */
370 __builtin_unreachable();
371 }
372 pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
373 pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
374 pkt->pktsched_plen = 0;
375 pkt->pktsched_pcnt = 0;
376 }
377
378 mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t * pkt)379 pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
380 {
381 mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
382
383 switch (pkt->pktsched_ptype) {
384 case QP_MBUF:
385 svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
386 break;
387
388 #if SKYWALK
389 case QP_PACKET:
390 svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
391 break;
392 #endif /* SKYWALK */
393
394 default:
395 VERIFY(0);
396 /* NOTREACHED */
397 __builtin_unreachable();
398 }
399
400 return svc;
401 }
402
403 void
pktsched_get_pkt_vars(pktsched_pkt_t * pkt,volatile uint32_t ** flags,uint64_t ** timestamp,uint32_t * flowid,uint8_t * flowsrc,uint8_t * proto,uint32_t * comp_gencnt)404 pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
405 uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
406 uint32_t *comp_gencnt)
407 {
408 switch (pkt->pktsched_ptype) {
409 case QP_MBUF: {
410 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
411
412 if (flags != NULL) {
413 *flags = &pkth->pkt_flags;
414 }
415 if (timestamp != NULL) {
416 *timestamp = &pkth->pkt_timestamp;
417 }
418 if (flowid != NULL) {
419 *flowid = pkth->pkt_flowid;
420 }
421 if (flowsrc != NULL) {
422 *flowsrc = pkth->pkt_flowsrc;
423 }
424 if (proto != NULL) {
425 *proto = pkth->pkt_proto;
426 }
427 if (comp_gencnt != NULL) {
428 *comp_gencnt = pkth->comp_gencnt;
429 }
430
431 break;
432 }
433
434 #if SKYWALK
435 case QP_PACKET: {
436 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
437
438 if (flags != NULL) {
439 /* use lower-32 bit for common flags */
440 *flags = &kp->pkt_pflags32;
441 }
442 if (timestamp != NULL) {
443 *timestamp = &kp->pkt_timestamp;
444 }
445 if (flowid != NULL) {
446 *flowid = kp->pkt_flow_token;
447 }
448 if (flowsrc != NULL) {
449 *flowsrc = (uint8_t)kp->pkt_flowsrc_type;
450 }
451 if (proto != NULL) {
452 *proto = kp->pkt_transport_protocol;
453 }
454 if (comp_gencnt != NULL) {
455 *comp_gencnt = kp->pkt_comp_gencnt;
456 }
457
458 break;
459 }
460 #endif /* SKYWALK */
461
462 default:
463 VERIFY(0);
464 /* NOTREACHED */
465 __builtin_unreachable();
466 }
467 }
468
469 struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t * pkt,struct ifnet * ifp,int how)470 pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
471 {
472 #pragma unused(ifp)
473 struct flowadv_fcentry *fce = NULL;
474
475 switch (pkt->pktsched_ptype) {
476 case QP_MBUF: {
477 struct mbuf *m = pkt->pktsched_pkt_mbuf;
478
479 fce = flowadv_alloc_entry(how);
480 if (fce == NULL) {
481 break;
482 }
483
484 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
485 sizeof(fce->fce_flowid));
486
487 fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
488 fce->fce_flowid = m->m_pkthdr.pkt_flowid;
489 #if SKYWALK
490 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
491 sizeof(fce->fce_flowsrc_token));
492 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
493 sizeof(fce->fce_flowsrc_fidx));
494
495 if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
496 fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
497 fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
498 fce->fce_ifp = ifp;
499 }
500 #endif /* SKYWALK */
501 break;
502 }
503
504 #if SKYWALK
505 case QP_PACKET: {
506 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
507
508 fce = flowadv_alloc_entry(how);
509 if (fce == NULL) {
510 break;
511 }
512
513 _CASSERT(sizeof(fce->fce_flowid) ==
514 sizeof(kp->pkt_flow_token));
515 _CASSERT(sizeof(fce->fce_flowsrc_fidx) ==
516 sizeof(kp->pkt_flowsrc_fidx));
517 _CASSERT(sizeof(fce->fce_flowsrc_token) ==
518 sizeof(kp->pkt_flowsrc_token));
519
520 ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
521 fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
522 fce->fce_flowid = kp->pkt_flow_token;
523 fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
524 fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
525 fce->fce_ifp = ifp;
526 break;
527 }
528 #endif /* SKYWALK */
529
530 default:
531 VERIFY(0);
532 /* NOTREACHED */
533 __builtin_unreachable();
534 }
535
536 return fce;
537 }
538
539 uint32_t *
pktsched_get_pkt_sfb_vars(pktsched_pkt_t * pkt,uint32_t ** sfb_flags)540 pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
541 {
542 uint32_t *hashp = NULL;
543
544 switch (pkt->pktsched_ptype) {
545 case QP_MBUF: {
546 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
547
548 _CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
549 _CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
550 *sfb_flags = &pkth->pkt_mpriv_flags;
551 hashp = &pkth->pkt_mpriv_hash;
552 break;
553 }
554
555 #if SKYWALK
556 case QP_PACKET: {
557 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
558
559 _CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t));
560 _CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t));
561 *sfb_flags = &kp->pkt_classq_flags;
562 hashp = &kp->pkt_classq_hash;
563 break;
564 }
565 #endif /* SKYWALK */
566
567 default:
568 VERIFY(0);
569 /* NOTREACHED */
570 __builtin_unreachable();
571 }
572
573 return hashp;
574 }
575
576 static int
pktsched_mbuf_mark_ecn(struct mbuf * m)577 pktsched_mbuf_mark_ecn(struct mbuf* m)
578 {
579 struct mbuf *m0;
580 void *hdr;
581 int af;
582 uint8_t ipv;
583
584 hdr = m->m_pkthdr.pkt_hdr;
585 /* verify that hdr is within the mbuf data */
586 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
587 if (((caddr_t)hdr >= m0->m_data) &&
588 ((caddr_t)hdr < m0->m_data + m0->m_len)) {
589 break;
590 }
591 }
592 if (m0 == NULL) {
593 return EINVAL;
594 }
595 ipv = IP_VHL_V(*(uint8_t *)hdr);
596 if (ipv == 4) {
597 af = AF_INET;
598 } else if (ipv == 6) {
599 af = AF_INET6;
600 } else {
601 af = AF_UNSPEC;
602 }
603
604 switch (af) {
605 case AF_INET: {
606 struct ip *ip = hdr;
607 uint8_t otos;
608 int sum;
609
610 if (((uintptr_t)ip + sizeof(*ip)) >
611 ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
612 return EINVAL; /* out of bounds */
613 }
614 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
615 return EINVAL; /* not-ECT */
616 }
617 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
618 return 0; /* already marked */
619 }
620 /*
621 * ecn-capable but not marked,
622 * mark CE and update checksum
623 */
624 otos = ip->ip_tos;
625 ip->ip_tos |= IPTOS_ECN_CE;
626 /*
627 * update checksum (from RFC1624) only if hw
628 * checksum is not supported.
629 * HC' = ~(~HC + ~m + m')
630 */
631 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
632 sum = ~ntohs(ip->ip_sum) & 0xffff;
633 sum += (~otos & 0xffff) + ip->ip_tos;
634 sum = (sum >> 16) + (sum & 0xffff);
635 sum += (sum >> 16); /* add carry */
636 ip->ip_sum = htons(~sum & 0xffff);
637 }
638 return 0;
639 }
640 case AF_INET6: {
641 struct ip6_hdr *ip6 = hdr;
642 u_int32_t flowlabel;
643
644 if (((uintptr_t)ip6 + sizeof(*ip6)) >
645 ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
646 return EINVAL; /* out of bounds */
647 }
648 flowlabel = ntohl(ip6->ip6_flow);
649 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
650 (IPTOS_ECN_NOTECT << 20)) {
651 return EINVAL; /* not-ECT */
652 }
653 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
654 (IPTOS_ECN_CE << 20)) {
655 return 0; /* already marked */
656 }
657 /*
658 * ecn-capable but not marked, mark CE
659 */
660 flowlabel |= (IPTOS_ECN_CE << 20);
661 ip6->ip6_flow = htonl(flowlabel);
662 return 0;
663 }
664 default:
665 return EPROTONOSUPPORT;
666 }
667 }
668
669 static int
pktsched_kpkt_mark_ecn(struct __kern_packet * kpkt)670 pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
671 {
672 uint8_t ipv = 0, *l3_hdr;
673
674 if (__improbable((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
675 ipv = kpkt->pkt_flow_ip_ver;
676 l3_hdr = (uint8_t *)kpkt->pkt_flow_ip_hdr;
677 } else {
678 uint8_t *pkt_buf;
679 uint16_t bdlen, bdlim, bdoff;
680 MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);
681
682 /* takes care of both IPv4 and IPv6 */
683 l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
684 ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
685 if (ipv == 4) {
686 ipv = IPVERSION;
687 } else if (ipv == 6) {
688 ipv = IPV6_VERSION;
689 } else {
690 ipv = 0;
691 }
692 }
693
694 switch (ipv) {
695 case IPVERSION: {
696 uint8_t otos;
697 int sum;
698
699 struct ip *ip = (struct ip *)(void *)l3_hdr;
700 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
701 return EINVAL; /* not-ECT */
702 }
703 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
704 return 0; /* already marked */
705 }
706 /*
707 * ecn-capable but not marked,
708 * mark CE and update checksum
709 */
710 otos = ip->ip_tos;
711 ip->ip_tos |= IPTOS_ECN_CE;
712
713 sum = ~ntohs(ip->ip_sum) & 0xffff;
714 sum += (~otos & 0xffff) + ip->ip_tos;
715 sum = (sum >> 16) + (sum & 0xffff);
716 sum += (sum >> 16); /* add carry */
717 ip->ip_sum = htons(~sum & 0xffff);
718
719 return 0;
720 }
721 case IPV6_VERSION: {
722 struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;
723 u_int32_t flowlabel;
724 flowlabel = ntohl(ip6->ip6_flow);
725 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
726 (IPTOS_ECN_NOTECT << 20)) {
727 return EINVAL; /* not-ECT */
728 }
729 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
730 (IPTOS_ECN_CE << 20)) {
731 return 0; /* already marked */
732 }
733 /*
734 * ecn-capable but not marked, mark CE
735 */
736 flowlabel |= (IPTOS_ECN_CE << 20);
737 ip6->ip6_flow = htonl(flowlabel);
738
739 return 0;
740 }
741 default:
742 return EPROTONOSUPPORT;
743 }
744 }
745
746 int
pktsched_mark_ecn(pktsched_pkt_t * pkt)747 pktsched_mark_ecn(pktsched_pkt_t *pkt)
748 {
749 switch (pkt->pktsched_ptype) {
750 case QP_MBUF:
751 return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
752 case QP_PACKET:
753 return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
754 default:
755 VERIFY(0);
756 /* NOTREACHED */
757 __builtin_unreachable();
758 }
759 }
760
761 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)762 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
763 {
764 switch (pkt->pktsched_ptype) {
765 case QP_MBUF: {
766 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
767 return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
768 }
769 case QP_PACKET: {
770 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
771 return (kp->pkt_pflags & PKT_F_L4S) != 0;
772 }
773
774 default:
775 VERIFY(0);
776 /* NOTREACHED */
777 __builtin_unreachable();
778 }
779 return FALSE;
780 }
781