1 /*
2 * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/cdefs.h>
30
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39
40 #include <dev/random/randomdev.h>
41 #include <net/droptap.h>
42 #include <net/if.h>
43 #include <net/if_var.h>
44 #include <net/if_dl.h>
45 #include <net/if_types.h>
46 #include <net/net_osdep.h>
47 #include <net/droptap.h>
48 #include <net/pktsched/pktsched.h>
49 #include <net/pktsched/pktsched_ops.h>
50 #include <net/pktsched/pktsched_fq_codel.h>
51 #include <net/pktsched/pktsched_netem.h>
52
53 #define _IP_VHL
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56
57 #include <pexpert/pexpert.h>
58
59 #if SKYWALK
60 #include <skywalk/os_skywalk_private.h>
61 #endif /* SKYWALK */
62
/* timebase (mach clock) frequency in Hz; set once by init_machclk() */
u_int32_t machclk_freq = 0;
/* mach absolute-time units per second; set once by init_machclk() */
u_int64_t machclk_per_sec = 0;
u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */

static void init_machclk(void);

/* sysctl parent node: net.pktsched */
SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");

/* sysctl net.pktsched.verbose: runtime knob for pktsched_verbose */
SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
    &pktsched_verbose, 0, "Packet scheduler verbosity level");
73
74 static void
pktsched_teardown_noop(__unused struct ifclassq * ifq)75 pktsched_teardown_noop(__unused struct ifclassq *ifq)
76 {
77 return;
78 }
79
80 static int
pktsched_request_noop(struct ifclassq * ifq,cqrq_t rq,void * arg)81 pktsched_request_noop(struct ifclassq *ifq, cqrq_t rq, void *arg)
82 {
83 #pragma unused(ifq, rq, arg)
84 return ENXIO;
85 }
86
87 static int
pktsched_getqstats_noop(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)88 pktsched_getqstats_noop(struct ifclassq *ifq,
89 uint8_t gid, u_int32_t qid,
90 struct if_ifclassq_stats *ifqs)
91 {
92 #pragma unused(ifq, gid, qid, ifqs)
93 return ENXIO;
94 }
95
/*
 * Enqueue handler installed while no real scheduler is attached.  The
 * chain is never queued: it is reported to droptap (when verbose drop
 * reporting is on) or freed outright, the caller is told the packets
 * were dropped via *pdrop, and ENXIO is returned.
 */
static int
pktsched_enqueue_noop(struct ifclassq *ifq,
    classq_pkt_t *h, classq_pkt_t *t, uint32_t cnt,
    uint32_t bytes, boolean_t *pdrop)
{
	pktsched_pkt_t pkt;
	/* wrap head/tail into a single pktsched descriptor for drop/free */
	pktsched_pkt_encap_chain(&pkt, h, t, cnt, bytes);
	if (__improbable(droptap_verbose > 0)) {
		/*
		 * NOTE(review): DROP_REASON_AQM_BK_SYS_THROTTLED looks like an
		 * odd reason for the "no scheduler attached" path -- confirm
		 * this is the intended drop reason.
		 */
		pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED,
		    __func__, __LINE__, 0);
	} else {
		pktsched_free_pkt(&pkt);
	}

	*pdrop = true;
	return ENXIO;
}
113
114 static int
pktsched_dequeue_noop(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)115 pktsched_dequeue_noop(struct ifclassq *ifq,
116 u_int32_t maxpktcnt, u_int32_t maxbytecnt,
117 classq_pkt_t *first_packet, classq_pkt_t *last_packet,
118 u_int32_t *retpktcnt, u_int32_t *retbytecnt,
119 uint8_t grp_idx)
120 {
121 #pragma unused(ifq, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx)
122 return ENXIO;
123 }
124
125 static int
pktsched_dequeue_sc_noop(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)126 pktsched_dequeue_sc_noop(struct ifclassq *ifq,
127 mbuf_svc_class_t svc, u_int32_t maxpktcnt,
128 u_int32_t maxbytecnt, classq_pkt_t *first_packet,
129 classq_pkt_t *last_packet, u_int32_t *retpktcnt,
130 u_int32_t *retbytecnt, uint8_t grp_idx)
131 {
132 #pragma unused(ifq, svc, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx)
133 return ENXIO;
134 }
135
136 static int
pktsched_setup_noop(struct ifclassq * ifq,u_int32_t flags,classq_pkt_type_t ptype)137 pktsched_setup_noop(struct ifclassq *ifq, u_int32_t flags,
138 classq_pkt_type_t ptype)
139 {
140 #pragma unused(ifq, flags, ptype)
141 return ENXIO;
142 }
143
144 static boolean_t
pktsched_allow_dequeue_noop(struct ifclassq * ifq)145 pktsched_allow_dequeue_noop(struct ifclassq *ifq)
146 {
147 #pragma unused(ifq)
148 return false;
149 }
150
/*
 * Default ops table installed while no real scheduler is configured:
 * enqueue drops the chain, dequeue/setup/request/stats fail with
 * ENXIO, and dequeue is never allowed.
 */
struct pktsched_ops pktsched_noops = {
	.ps_id = PKTSCHEDT_NONE,
	.ps_setup = pktsched_setup_noop,
	.ps_teardown = pktsched_teardown_noop,
	.ps_enq = pktsched_enqueue_noop,
	.ps_deq = pktsched_dequeue_noop,
	.ps_deq_sc = pktsched_dequeue_sc_noop,
	.ps_req = pktsched_request_noop,
	.ps_stats = pktsched_getqstats_noop,
	.ps_allow_dequeue = pktsched_allow_dequeue_noop,
};
162
/*
 * One-time packet scheduler initialization: capture the machine clock
 * frequency (panic if it is unavailable), register the "no scheduler"
 * ops table, and initialize the fq_codel scheduler.
 */
void
pktsched_init(void)
{
	init_machclk();
	if (machclk_freq == 0) {
		panic("%s: no CPU clock available!", __func__);
		/* NOTREACHED */
	}
	pktsched_ops_register(&pktsched_noops);
	pktsched_fq_init();
}
174
/*
 * Populate machclk_freq and machclk_per_sec from the platform clock:
 * the former from the device timebase frequency, the latter as the
 * absolute-time equivalent of one second.
 */
static void
init_machclk(void)
{
	/*
	 * Initialize machclk_freq using the timerbase frequency
	 * value from device specific info.
	 */
	machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;

	/* 1 * NSEC_PER_SEC nanoseconds == one second, in abstime units */
	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
	    &machclk_per_sec);
}
187
188 u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)189 pktsched_abs_to_nsecs(u_int64_t abstime)
190 {
191 u_int64_t nsecs;
192
193 absolutetime_to_nanoseconds(abstime, &nsecs);
194 return nsecs;
195 }
196
197 u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)198 pktsched_nsecs_to_abstime(u_int64_t nsecs)
199 {
200 u_int64_t abstime;
201
202 nanoseconds_to_absolutetime(nsecs, &abstime);
203 return abstime;
204 }
205
/*
 * Attach (or switch to) the scheduler of the given type on the
 * interface classq.  Any existing scheduler is torn down first, then
 * the new scheduler's setup routine is invoked.
 *
 * Returns 0 on success; ENOTSUP when the current fq_codel instance has
 * extra classq groups active (switching away is not supported then);
 * otherwise whatever the new scheduler's ps_setup returns.
 *
 * Caller must hold the ifclassq lock.
 */
int
pktsched_setup(struct ifclassq *ifq, u_int8_t scheduler, u_int32_t sflags,
    classq_pkt_type_t ptype)
{
	int error = 0;
	u_int32_t rflags;
	pktsched_ops_t *ops;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	VERIFY(machclk_freq != 0);

	/* Nothing to do unless the scheduler type changes */
	if (ifq->ifcq_type == scheduler) {
		return 0;
	}

	/*
	 * Remember the flags that need to be restored upon success, as
	 * they may be cleared when we tear down existing scheduler.
	 */
	rflags = (ifq->ifcq_flags & IFCQF_ENABLED);

	if (ifq->ifcq_type != PKTSCHEDT_NONE) {
		/* Don't support changing qdisc for fq_codel that has multiple groups */
		if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
			fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
			uint8_t grp_idx;
			/* group 0 is implicit; any other group in use blocks the switch */
			for (grp_idx = 1; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				if (fqs->fqs_classq_groups[grp_idx] != NULL) {
					return ENOTSUP;
				}
			}
		}

		pktsched_teardown(ifq);

		/* Teardown should have succeeded */
		VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
		VERIFY(ifq->ifcq_disc == NULL);
	}

	ops = pktsched_ops_find(scheduler);
	ASSERT(ops != NULL);
	ifq->ifcq_ops = ops;
	error = ops->ps_setup(ifq, sflags, ptype);
	if (error == 0) {
		ifq->ifcq_flags |= rflags;
	}
	/*
	 * NOTE(review): IFCQF_LOCKLESS is set even when ps_setup failed
	 * above -- confirm that is intended.
	 */
	if (ops->ps_ops_flags & PKTSCHED_OPS_LOCKLESS) {
		ifq->ifcq_flags |= IFCQF_LOCKLESS;
	}

	return error;
}
261
262 void
pktsched_teardown(struct ifclassq * ifq)263 pktsched_teardown(struct ifclassq *ifq)
264 {
265 IFCQ_LOCK_ASSERT_HELD(ifq);
266 ifq->ifcq_ops->ps_req(ifq, CLASSQRQ_PURGE, 0);
267 VERIFY(IFCQ_IS_EMPTY(ifq));
268 ifq->ifcq_flags &= ~IFCQF_ENABLED;
269 ifq->ifcq_ops->ps_teardown(ifq);
270 return;
271 }
272
// TODO: change function signature to be more generic
/*
 * Fetch per-queue statistics for group gid / queue qid by delegating
 * to the attached scheduler's stats handler.  gid is narrowed to
 * uint8_t for the ops interface.  Caller must hold the ifclassq lock.
 */
int
pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
    struct if_ifclassq_stats *ifqs)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	return ifq->ifcq_ops->ps_stats(ifq, (uint8_t)gid, qid, ifqs);
}
282
/*
 * Wrap a single classq packet into a pktsched_pkt_t descriptor: head
 * and tail both reference the one packet, count is 1, and the byte
 * length is read from the packet's own metadata.
 */
void
pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
{
	pkt->pktsched_pkt = *cpkt;
	pkt->pktsched_tail = *cpkt;
	pkt->pktsched_pcnt = 1;

	switch (cpkt->cp_ptype) {
	case QP_MBUF:
		pkt->pktsched_plen =
		    (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
		break;

#if SKYWALK
	case QP_PACKET:
		pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
		break;
#endif /* SKYWALK */

	default:
		/* unknown classq packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
308
/*
 * Wrap a chain of classq packets into a pktsched_pkt_t descriptor.
 * Unlike pktsched_pkt_encap(), the packet count and total byte length
 * are supplied by the caller; the switch below only validates that the
 * head packet's type is one we understand.
 */
void
pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
{
	pkt->pktsched_pkt = *cpkt;
	pkt->pktsched_tail = *tail;
	pkt->pktsched_pcnt = cnt;
	pkt->pktsched_plen = bytes;

	switch (cpkt->cp_ptype) {
	case QP_MBUF:
		break;

#if SKYWALK
	case QP_PACKET:
		break;
#endif /* SKYWALK */

	default:
		/* unknown classq packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
333
/*
 * Duplicate a single packet (mbuf or skywalk packet) from pkt1 into
 * pkt2.  Only one-packet descriptors are supported (pcnt == 1), and an
 * in-place clone (pkt1 == pkt2) is explicitly permitted.
 *
 * Returns 0 on success, with pkt2 describing a one-packet chain of the
 * same length and type as pkt1; ENOBUFS if mbuf duplication fails, or
 * the error from kern_packet_clone_nosleep() for skywalk packets.
 */
int
pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
{
	struct mbuf *m1, *m2;
#if SKYWALK
	struct __kern_packet *p1;
	kern_packet_t ph2;
	int err;
#endif /* SKYWALK */

	ASSERT(pkt1 != NULL);
	ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
	ASSERT(pkt1->pktsched_pcnt == 1);

	/* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
	ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
	    pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
	    pkt2->pktsched_pkt_mbuf == NULL));

	switch (pkt1->pktsched_ptype) {
	case QP_MBUF:
		m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
		/* deep copy; M_NOWAIT since we may be in the tx path */
		m2 = m_dup(m1, M_NOWAIT);
		if (__improbable(m2 == NULL)) {
			return ENOBUFS;
		}
		pkt2->pktsched_pkt_mbuf = m2;
		break;

#if SKYWALK
	case QP_PACKET:
		p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
		/* heavy copy duplicates metadata and buffer contents */
		err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
		    METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
		    KPKT_COPY_HEAVY);
		if (__improbable(err != 0)) {
			return err;
		}
		ASSERT(ph2 != 0);
		VERIFY(kern_packet_finalize(ph2) == 0);
		pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
		break;
#endif /* SKYWALK */

	default:
		/* unknown pktsched packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mirror the remaining descriptor fields onto the clone */
	pkt2->pktsched_plen = pkt1->pktsched_plen;
	pkt2->pktsched_ptype = pkt1->pktsched_ptype;
	pkt2->pktsched_tail = pkt2->pktsched_pkt;
	pkt2->pktsched_pcnt = 1;
	return 0;
}
390
/*
 * Flip one randomly chosen bit of the packet's data, for fault
 * injection / corruption testing (e.g. via netem).  The byte offset
 * and bit position are both derived from a single random word.
 *
 * NOTE(review): a packet with data_len == 0 would make the modulo
 * below divide by zero -- presumably callers only pass packets with
 * payload; confirm.
 */
void
pktsched_corrupt_packet(pktsched_pkt_t *pkt)
{
	struct mbuf *m = NULL;
	uint8_t *data = NULL;
	uint32_t data_len = 0;
	uint32_t rand32, rand_off, rand_bit;
#if SKYWALK
	struct __kern_packet *p = NULL;
#endif /* SKYWALK */

	/* locate the packet's data pointer and length by packet type */
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		m = pkt->pktsched_pkt_mbuf;
		data = mtod(m, uint8_t *);
		data_len = m->m_pkthdr.len;
		break;
#if SKYWALK
	case QP_PACKET:
		p = pkt->pktsched_pkt_kpkt;
		if (p->pkt_pflags & PKT_F_MBUF_DATA) {
			/* skywalk packet backed by an mbuf: use the mbuf data */
			m = p->pkt_mbuf;
			data = mtod(m, uint8_t *);
			data_len = m->m_pkthdr.len;
		} else {
			MD_BUFLET_ADDR_DLEN(p, data, data_len);
		}
		break;
#endif /* SKYWALK */

	default:
		/* NOTREACHED */
		VERIFY(0);
		__builtin_unreachable();
	}

	/* low 3 bits pick the bit, remaining bits pick the byte offset */
	read_frandom(&rand32, sizeof(rand32));
	rand_bit = rand32 & 0x7;
	rand_off = (rand32 >> 3) % data_len;
	data[rand_off] ^= (uint8_t)(1 << rand_bit);
}
432
/*
 * Free the packet chain held by pkt and reset the descriptor to an
 * empty state.  Sanity-checks that the stored packet count matches the
 * chain shape (a single packet must have no successor), and for
 * skywalk packets that the pool freed exactly that many.
 */
void
pktsched_free_pkt(pktsched_pkt_t *pkt)
{
	uint32_t cnt = pkt->pktsched_pcnt;
	ASSERT(cnt != 0);

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m;

		m = pkt->pktsched_pkt_mbuf;
		if (cnt == 1) {
			VERIFY(m->m_nextpkt == NULL);
		} else {
			VERIFY(m->m_nextpkt != NULL);
		}
		m_freem_list(m);
		break;
	}
#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kpkt;
		int pcnt = 0;

		kpkt = pkt->pktsched_pkt_kpkt;
		if (cnt == 1) {
			VERIFY(kpkt->pkt_nextpkt == NULL);
		} else {
			VERIFY(kpkt->pkt_nextpkt != NULL);
		}
		/* pcnt reports how many packets the pool actually freed */
		pp_free_packet_chain(kpkt, &pcnt);
		VERIFY(cnt == (uint32_t)pcnt);
		break;
	}
#endif /* SKYWALK */

	default:
		/* unknown pktsched packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* scrub the descriptor so stale pointers cannot be reused */
	pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
	pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
	pkt->pktsched_plen = 0;
	pkt->pktsched_pcnt = 0;
}
479
/*
 * Drop the packet chain held by pkt.  When no drop tap is attached
 * this is just pktsched_free_pkt(); otherwise the drop is reported to
 * droptap with the given reason/call-site annotation before freeing.
 * For mbufs the flags are OR'ed with DROPTAP_FLAG_DIR_OUT.
 */
void
pktsched_drop_pkt(pktsched_pkt_t *pkt, struct ifnet *ifp, drop_reason_t reason, const char *funcname,
    uint16_t linenum, uint16_t flags)
{
	/* fast path: no taps attached, just free the chain */
	if (__probable(droptap_total_tap_count == 0)) {
		pktsched_free_pkt(pkt);
		return;
	}

	uint32_t cnt = pkt->pktsched_pcnt;
	ASSERT(cnt != 0);

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m;

		m = pkt->pktsched_pkt_mbuf;
		if (cnt == 1) {
			VERIFY(m->m_nextpkt == NULL);
		} else {
			VERIFY(m->m_nextpkt != NULL);
		}
		/* m_drop_list both reports to droptap and frees the chain */
		m_drop_list(m, ifp, flags | DROPTAP_FLAG_DIR_OUT, reason, funcname, linenum);
		break;
	}
#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kpkt;

		kpkt = pkt->pktsched_pkt_kpkt;
		if (cnt == 1) {
			VERIFY(kpkt->pkt_nextpkt == NULL);
		} else {
			VERIFY(kpkt->pkt_nextpkt != NULL);
		}
		/*
		 * NOTE(review): only the head packet is passed to
		 * droptap_output_packet() here while the whole chain is
		 * freed below -- confirm chains are reported as intended.
		 */
		droptap_output_packet(SK_PKT2PH(kpkt), reason, funcname, linenum,
		    flags, ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
		pktsched_free_pkt(pkt);
		break;
	}
#endif /* SKYWALK */

	default:
		/* unknown pktsched packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
528
/*
 * Return the service class of the packet, dispatching on packet type
 * (mbuf vs skywalk packet).
 */
mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
{
	mbuf_svc_class_t svc = MBUF_SC_UNSPEC;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
		break;

#if SKYWALK
	case QP_PACKET:
		svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
		break;
#endif /* SKYWALK */

	default:
		/* unknown pktsched packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return svc;
}
553
/*
 * Extract commonly needed metadata from a packet in one pass.  Every
 * out-parameter is optional (skipped when NULL):
 *
 *   flags        - pointer to the packet's live flags word (writable)
 *   timestamp    - pointer to the packet's live timestamp (writable)
 *   flowid       - flow identifier / token
 *   flowsrc      - flow source type
 *   proto        - transport protocol (IPPROTO_QUIC for QUIC mbufs)
 *   comp_gencnt  - compression generation count
 *   pkt_tx_time  - requested transmit time (0 if none recorded)
 */
void
pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
    uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
    uint32_t *comp_gencnt, uint64_t *pkt_tx_time)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		if (flags != NULL) {
			*flags = &pkth->pkt_flags;
		}
		if (timestamp != NULL) {
			*timestamp = &pkth->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = pkth->pkt_flowid;
		}
		if (flowsrc != NULL) {
			*flowsrc = pkth->pkt_flowsrc;
		}
		if (proto != NULL) {
			/*
			 * rdar://100524205 - We want to use the pkt_ext_flags
			 * to denote QUIC packets, but AQM is already written in
			 * such a way where IPPROTO_QUIC is used to denote QUIC
			 * packets.
			 */
			if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) {
				*proto = IPPROTO_QUIC;
			} else {
				*proto = pkth->pkt_proto;
			}
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = pkth->comp_gencnt;
		}
		if (pkt_tx_time != NULL) {
			/* tx time for mbufs is carried in the AQM m_tag, if any */
			struct m_tag *tag;
			tag = m_tag_locate(pkt->pktsched_pkt_mbuf, KERNEL_MODULE_TAG_ID,
			    KERNEL_TAG_TYPE_AQM);
			if (__improbable(tag != NULL)) {
				*pkt_tx_time = *(uint64_t *)tag->m_tag_data;
			} else {
				*pkt_tx_time = 0;
			}
		}

		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		if (flags != NULL) {
			/* use lower-32 bit for common flags */
			*flags = &kp->pkt_pflags32;
		}
		if (timestamp != NULL) {
			*timestamp = &kp->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = kp->pkt_flow_token;
		}
		if (flowsrc != NULL) {
			*flowsrc = (uint8_t)kp->pkt_flowsrc_type;
		}
		if (proto != NULL) {
			*proto = kp->pkt_transport_protocol;
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = kp->pkt_comp_gencnt;
		}
		if (pkt_tx_time != NULL) {
			*pkt_tx_time = __packet_get_tx_timestamp(SK_PKT2PH(kp));
		}

		break;
	}
#endif /* SKYWALK */

	default:
		/* unknown pktsched packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
642
/*
 * Allocate a flow-advisory entry populated from the packet's flow
 * metadata (flow id, source type, and for channel flows the source
 * token/index and interface).  Returns NULL if allocation fails; the
 * caller owns the returned entry.
 */
struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
{
#pragma unused(ifp)
	struct flowadv_fcentry *fce = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m = pkt->pktsched_pkt_mbuf;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		/* field widths must agree for the straight copies below */
		static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(fce->fce_flowid));

		fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		fce->fce_flowid = m->m_pkthdr.pkt_flowid;
#if SKYWALK
		static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(fce->fce_flowsrc_token));
		static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(fce->fce_flowsrc_fidx));

		/* channel-sourced mbufs carry extra flow info in mpriv fields */
		if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
			fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
			fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
			fce->fce_ifp = ifp;
		}
#endif /* SKYWALK */
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		static_assert(sizeof(fce->fce_flowid) == sizeof(kp->pkt_flow_token));
		static_assert(sizeof(fce->fce_flowsrc_fidx) == sizeof(kp->pkt_flowsrc_fidx));
		static_assert(sizeof(fce->fce_flowsrc_token) == sizeof(kp->pkt_flowsrc_token));

		/* only packets participating in flow advisory should get here */
		ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
		fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
		fce->fce_flowid = kp->pkt_flow_token;
		fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
		fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
		fce->fce_ifp = ifp;
		break;
	}
#endif /* SKYWALK */

	default:
		/* unknown pktsched packet type */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return fce;
}
706
/*
 * Set the CE (Congestion Experienced) codepoint on an mbuf's IP or
 * IPv6 header, per RFC 3168 ECN marking.
 *
 * Returns 0 if the packet was marked (or already carried CE), EINVAL
 * if the packet is not ECN-capable or the cached header pointer cannot
 * be validated against the mbuf chain, and EPROTONOSUPPORT for
 * non-IP packets.
 */
static int
pktsched_mbuf_mark_ecn(struct mbuf* m)
{
	struct mbuf *m0;
	void *__single hdr;
	int af;
	uint8_t ipv;

	hdr = m->m_pkthdr.pkt_hdr;
	/* verify that hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (((caddr_t)hdr >= m_mtod_current(m0)) &&
		    ((caddr_t)hdr < m_mtod_current(m0) + m0->m_len)) {
			break;
		}
	}
	if (m0 == NULL) {
		return EINVAL;
	}
	/* version nibble distinguishes IPv4 from IPv6 */
	ipv = IP_VHL_V(*(uint8_t *)hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}

	switch (af) {
	case AF_INET: {
		struct ip *__single ip = (struct ip *)(void *)hdr;
		uint8_t otos;
		int sum;

		/* the full IPv4 header must fit inside m0's buffer */
		if (((uintptr_t)ip + sizeof(*ip)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;	/* out of bounds */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;	/* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;	/* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;
		/*
		 * update checksum (from RFC1624) only if hw
		 * checksum is not supported.
		 * HC' = ~(~HC + ~m + m')
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
		}
		return 0;
	}
	case AF_INET6: {
		struct ip6_hdr *__single ip6 = (struct ip6_hdr *)(void *)hdr;
		u_int32_t flowlabel;

		/* the full IPv6 header must fit inside m0's buffer */
		if (((uintptr_t)ip6 + sizeof(*ip6)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;	/* out of bounds */
		}
		/* ECN bits live at bits 20-21 of the host-order flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;	/* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;	/* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);
		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
799
/*
 * Set the CE (Congestion Experienced) codepoint on a skywalk packet's
 * IP or IPv6 header (RFC 3168).  For classified flows the cached flow
 * metadata gives the header location directly; otherwise the L3 header
 * is located in the buflet past the L2 header and its version nibble
 * is parsed.
 *
 * Returns 0 if marked (or already marked), EINVAL if not ECN-capable,
 * EPROTONOSUPPORT for non-IP packets.
 */
static int
pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
{
	uint8_t ipv = 0, *l3_hdr;

	if ((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0) {
		/* flow classifier already recorded IP version and header */
		uint32_t l3_len = 0;
		ipv = kpkt->pkt_flow_ip_ver;
		l3_len = kpkt->pkt_length - kpkt->pkt_l2_len;
		l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *, kpkt->pkt_flow_ip_hdr, l3_len);
	} else {
		uint8_t *pkt_buf;
		uint32_t bdlen, bdlim, bdoff;
		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);

		/* takes care of both IPv4 and IPv6 */
		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
		/* normalize to the header constants used by the switch below */
		if (ipv == 4) {
			ipv = IPVERSION;
		} else if (ipv == 6) {
			ipv = IPV6_VERSION;
		} else {
			ipv = 0;
		}
	}

	switch (ipv) {
	case IPVERSION: {
		uint8_t otos;
		int sum;

		struct ip *ip = (struct ip *)(void *)l3_hdr;
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;	/* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;	/* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;

		/* incremental checksum update per RFC 1624: HC' = ~(~HC + ~m + m') */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += (~otos & 0xffff) + ip->ip_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16);  /* add carry */
		ip->ip_sum = htons(~sum & 0xffff);

		return 0;
	}
	case IPV6_VERSION: {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)(void *)l3_hdr;
		u_int32_t flowlabel;

		/* ECN bits live at bits 20-21 of the host-order flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;	/* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;	/* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);

		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
878
/*
 * Mark CE on the packet's IP header, dispatching on packet type.
 * Returns the result of the type-specific marking routine.
 *
 * NOTE(review): unlike the other type switches in this file, the
 * QP_PACKET case here is not wrapped in #if SKYWALK -- confirm this
 * translation unit is only built with SKYWALK enabled.
 */
int
pktsched_mark_ecn(pktsched_pkt_t *pkt)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
	case QP_PACKET:
		return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
893
894 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)895 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
896 {
897 switch (pkt->pktsched_ptype) {
898 case QP_MBUF: {
899 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
900 return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
901 }
902 case QP_PACKET: {
903 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
904 return (kp->pkt_pflags & PKT_F_L4S) != 0;
905 }
906
907 default:
908 VERIFY(0);
909 /* NOTREACHED */
910 __builtin_unreachable();
911 }
912 return FALSE;
913 }
914
/*
 * Backing storage for the AQM m_tag.  The m_tag header must be the
 * first member so the allocator can return &aqm_m_tag and the free
 * routine can recover the container with a plain cast.
 */
struct aqm_tag_container {
	struct m_tag aqm_m_tag;	/* generic m_tag header (must be first) */
	uint64_t aqm_tag;	/* 64-bit payload (read as pkt tx time in pktsched_get_pkt_vars) */
};
919
/*
 * Typed allocator for the AQM m_tag.  Only KERNEL_MODULE_TAG_ID /
 * KERNEL_TAG_TYPE_AQM with an 8-byte payload is valid; any other
 * length yields NULL.  The tag's data pointer refers to the payload
 * embedded in the same container allocation.
 */
static struct m_tag *
m_tag_kalloc_aqm(u_int32_t id, u_int16_t type, uint16_t len, int wait)
{
	struct aqm_tag_container *tag_container;
	struct m_tag *tag = NULL;

	assert3u(id, ==, KERNEL_MODULE_TAG_ID);
	assert3u(type, ==, KERNEL_TAG_TYPE_AQM);
	assert3u(len, ==, sizeof(uint64_t));

	if (len != sizeof(uint64_t)) {
		return NULL;
	}

	tag_container = kalloc_type(struct aqm_tag_container, wait | M_ZERO);
	if (tag_container != NULL) {
		tag = &tag_container->aqm_m_tag;

		/* m_tag must be the first member for the cast in the free path */
		assert3p(tag, ==, tag_container);

		M_TAG_INIT(tag, id, type, len, &tag_container->aqm_tag, NULL);
	}

	return tag;
}
945
/*
 * Typed free routine for the AQM m_tag: recover the enclosing
 * container (the tag is its first member) and release it.
 */
static void
m_tag_kfree_aqm(struct m_tag *tag)
{
	struct aqm_tag_container *__single tag_container = (struct aqm_tag_container *)tag;

	assert3u(tag->m_tag_len, ==, sizeof(uint64_t));

	kfree_type(struct aqm_tag_container, tag_container);
}
955
/*
 * Register the AQM m_tag type with the mbuf tag subsystem so the typed
 * alloc/free routines above are used for KERNEL_TAG_TYPE_AQM tags.
 * Registration is expected to succeed.
 */
void
pktsched_register_m_tag(void)
{
	int error;

	error = m_register_internal_tag_type(KERNEL_TAG_TYPE_AQM, sizeof(uint64_t),
	    m_tag_kalloc_aqm, m_tag_kfree_aqm);

	assert3u(error, ==, 0);
}
966