xref: /xnu-12377.1.9/bsd/net/pktsched/pktsched.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/cdefs.h>
30 
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39 
40 #include <dev/random/randomdev.h>
41 #include <net/droptap.h>
42 #include <net/if.h>
43 #include <net/if_var.h>
44 #include <net/if_dl.h>
45 #include <net/if_types.h>
46 #include <net/net_osdep.h>
47 #include <net/droptap.h>
48 #include <net/pktsched/pktsched.h>
49 #include <net/pktsched/pktsched_ops.h>
50 #include <net/pktsched/pktsched_fq_codel.h>
51 #include <net/pktsched/pktsched_netem.h>
52 
53 #define _IP_VHL
54 #include <netinet/ip.h>
55 #include <netinet/ip6.h>
56 
57 #include <pexpert/pexpert.h>
58 
59 #if SKYWALK
60 #include <skywalk/os_skywalk_private.h>
61 #endif /* SKYWALK */
62 
/* CPU timebase frequency in Hz; latched once by init_machclk() */
u_int32_t machclk_freq = 0;
/* one second expressed in mach absolute-time units */
u_int64_t machclk_per_sec = 0;
u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */

static void init_machclk(void);

/* sysctl subtree: net.pktsched */
SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");

/* net.pktsched.verbose: runtime-tunable scheduler debug verbosity */
SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
    &pktsched_verbose, 0, "Packet scheduler verbosity level");
73 
74 static void
pktsched_teardown_noop(__unused struct ifclassq * ifq)75 pktsched_teardown_noop(__unused struct ifclassq *ifq)
76 {
77 	return;
78 }
79 
80 static int
pktsched_request_noop(struct ifclassq * ifq,cqrq_t rq,void * arg)81 pktsched_request_noop(struct ifclassq *ifq, cqrq_t rq, void *arg)
82 {
83 #pragma unused(ifq, rq, arg)
84 	return ENXIO;
85 }
86 
87 static int
pktsched_getqstats_noop(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)88 pktsched_getqstats_noop(struct ifclassq *ifq,
89     uint8_t gid, u_int32_t qid,
90     struct if_ifclassq_stats *ifqs)
91 {
92 #pragma unused(ifq, gid, qid, ifqs)
93 	return ENXIO;
94 }
95 
96 static int
pktsched_enqueue_noop(struct ifclassq * ifq,classq_pkt_t * h,classq_pkt_t * t,uint32_t cnt,uint32_t bytes,boolean_t * pdrop)97 pktsched_enqueue_noop(struct ifclassq *ifq,
98     classq_pkt_t *h, classq_pkt_t *t, uint32_t cnt,
99     uint32_t bytes, boolean_t *pdrop)
100 {
101 	pktsched_pkt_t pkt;
102 	pktsched_pkt_encap_chain(&pkt, h, t, cnt, bytes);
103 	if (__improbable(droptap_verbose > 0)) {
104 		pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED,
105 		    __func__, __LINE__, 0);
106 	} else {
107 		pktsched_free_pkt(&pkt);
108 	}
109 
110 	*pdrop = true;
111 	return ENXIO;
112 }
113 
114 static int
pktsched_dequeue_noop(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)115 pktsched_dequeue_noop(struct ifclassq *ifq,
116     u_int32_t maxpktcnt, u_int32_t maxbytecnt,
117     classq_pkt_t *first_packet, classq_pkt_t *last_packet,
118     u_int32_t *retpktcnt, u_int32_t *retbytecnt,
119     uint8_t grp_idx)
120 {
121 #pragma unused(ifq, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx)
122 	return ENXIO;
123 }
124 
125 static int
pktsched_dequeue_sc_noop(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)126 pktsched_dequeue_sc_noop(struct ifclassq *ifq,
127     mbuf_svc_class_t svc, u_int32_t maxpktcnt,
128     u_int32_t maxbytecnt, classq_pkt_t *first_packet,
129     classq_pkt_t *last_packet, u_int32_t *retpktcnt,
130     u_int32_t *retbytecnt, uint8_t grp_idx)
131 {
132 #pragma unused(ifq, svc, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx)
133 	return ENXIO;
134 }
135 
136 static int
pktsched_setup_noop(struct ifclassq * ifq,u_int32_t flags,classq_pkt_type_t ptype)137 pktsched_setup_noop(struct ifclassq *ifq, u_int32_t flags,
138     classq_pkt_type_t ptype)
139 {
140 #pragma unused(ifq, flags, ptype)
141 	return ENXIO;
142 }
143 
144 static boolean_t
pktsched_allow_dequeue_noop(struct ifclassq * ifq)145 pktsched_allow_dequeue_noop(struct ifclassq *ifq)
146 {
147 #pragma unused(ifq)
148 	return false;
149 }
150 
/*
 * Placeholder scheduler ops vector, installed while an interface has
 * no real scheduler configured (PKTSCHEDT_NONE).  Enqueue drops the
 * chain; every other handler fails with ENXIO or is a no-op.
 */
struct pktsched_ops pktsched_noops = {
	.ps_id             = PKTSCHEDT_NONE,
	.ps_setup          = pktsched_setup_noop,
	.ps_teardown       = pktsched_teardown_noop,
	.ps_enq            = pktsched_enqueue_noop,
	.ps_deq            = pktsched_dequeue_noop,
	.ps_deq_sc         = pktsched_dequeue_sc_noop,
	.ps_req            = pktsched_request_noop,
	.ps_stats          = pktsched_getqstats_noop,
	.ps_allow_dequeue  = pktsched_allow_dequeue_noop,
};
162 
/*
 * One-time packet scheduler subsystem initialization: latch the CPU
 * timebase frequency, register the placeholder ops vector, and
 * initialize the fq_codel scheduler.  Panics if no CPU clock is
 * available, since all schedulers depend on machclk_freq.
 */
void
pktsched_init(void)
{
	init_machclk();
	if (machclk_freq == 0) {
		panic("%s: no CPU clock available!", __func__);
		/* NOTREACHED */
	}
	pktsched_ops_register(&pktsched_noops);
	pktsched_fq_init();
}
174 
/*
 * Latch the machine clock parameters used by the schedulers:
 * machclk_freq from the platform timebase frequency, and
 * machclk_per_sec as one second converted to mach absolute time.
 */
static void
init_machclk(void)
{
	/*
	 * Initialize machclk_freq using the timerbase frequency
	 * value from device specific info.
	 */
	machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;

	/* 1 interval of NSEC_PER_SEC ns == one second in abstime units */
	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
	    &machclk_per_sec);
}
187 
188 u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)189 pktsched_abs_to_nsecs(u_int64_t abstime)
190 {
191 	u_int64_t nsecs;
192 
193 	absolutetime_to_nanoseconds(abstime, &nsecs);
194 	return nsecs;
195 }
196 
197 u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)198 pktsched_nsecs_to_abstime(u_int64_t nsecs)
199 {
200 	u_int64_t abstime;
201 
202 	nanoseconds_to_absolutetime(nsecs, &abstime);
203 	return abstime;
204 }
205 
/*
 * Configure scheduler type 'scheduler' on 'ifq', tearing down any
 * previously-configured scheduler first.  'sflags' and 'ptype' are
 * passed through to the new scheduler's setup handler.
 *
 * Returns 0 on success (or if the type is unchanged); ENOTSUP when
 * the current fq_codel instance has extra groups (switching away is
 * not supported); otherwise the error from the new scheduler's setup.
 *
 * Caller must hold the ifclassq lock.
 */
int
pktsched_setup(struct ifclassq *ifq, u_int8_t scheduler, u_int32_t sflags,
    classq_pkt_type_t ptype)
{
	int error = 0;
	u_int32_t rflags;
	pktsched_ops_t *ops;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	VERIFY(machclk_freq != 0);

	/* Nothing to do unless the scheduler type changes */
	if (ifq->ifcq_type == scheduler) {
		return 0;
	}

	/*
	 * Remember the flags that need to be restored upon success, as
	 * they may be cleared when we tear down existing scheduler.
	 */
	rflags = (ifq->ifcq_flags & IFCQF_ENABLED);

	if (ifq->ifcq_type != PKTSCHEDT_NONE) {
		/* Don't support changing qdisc for fq_codel that has multiple groups */
		if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
			fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
			uint8_t grp_idx;
			/* group 0 is the default; any other live group blocks the switch */
			for (grp_idx = 1; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				if (fqs->fqs_classq_groups[grp_idx] != NULL) {
					return ENOTSUP;
				}
			}
		}

		pktsched_teardown(ifq);

		/* Teardown should have succeeded */
		VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
		VERIFY(ifq->ifcq_disc == NULL);
	}

	ops = pktsched_ops_find(scheduler);
	ASSERT(ops != NULL);
	ifq->ifcq_ops = ops;
	error = ops->ps_setup(ifq, sflags, ptype);
	if (error == 0) {
		/* restore the enabled flag that teardown cleared */
		ifq->ifcq_flags |= rflags;
	}
	/*
	 * NOTE(review): IFCQF_LOCKLESS is set even when ps_setup failed,
	 * and ifcq_ops stays pointed at the new vector — confirm intended.
	 */
	if (ops->ps_ops_flags & PKTSCHED_OPS_LOCKLESS) {
		ifq->ifcq_flags |= IFCQF_LOCKLESS;
	}

	return error;
}
261 
262 void
pktsched_teardown(struct ifclassq * ifq)263 pktsched_teardown(struct ifclassq *ifq)
264 {
265 	IFCQ_LOCK_ASSERT_HELD(ifq);
266 	ifq->ifcq_ops->ps_req(ifq, CLASSQRQ_PURGE, 0);
267 	VERIFY(IFCQ_IS_EMPTY(ifq));
268 	ifq->ifcq_flags &= ~IFCQF_ENABLED;
269 	ifq->ifcq_ops->ps_teardown(ifq);
270 	return;
271 }
272 
// TODO: change function signature to be more generic
/*
 * Fetch per-queue statistics for group 'gid', queue 'qid' from the
 * configured scheduler.  Note: gid is narrowed to uint8_t for the
 * scheduler ops interface.  Caller must hold the ifclassq lock.
 */
int
pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
    struct if_ifclassq_stats *ifqs)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	return ifq->ifcq_ops->ps_stats(ifq, (uint8_t)gid, qid, ifqs);
}
282 
/*
 * Wrap a single classq packet in a pktsched_pkt_t descriptor: head
 * and tail both reference the packet, count is 1, and the byte length
 * is taken from the underlying mbuf or skywalk packet.
 */
void
pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
{
	pkt->pktsched_pkt = *cpkt;
	pkt->pktsched_tail = *cpkt;
	pkt->pktsched_pcnt = 1;

	switch (cpkt->cp_ptype) {
	case QP_MBUF:
		pkt->pktsched_plen =
		    (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
		break;

#if SKYWALK
	case QP_PACKET:
		pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
308 
/*
 * Wrap a packet chain in a pktsched_pkt_t descriptor using the
 * caller-supplied head, tail, packet count and byte count.  The
 * switch only validates that the packet type is one we support.
 */
void
pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
{
	pkt->pktsched_pkt = *cpkt;
	pkt->pktsched_tail = *tail;
	pkt->pktsched_pcnt = cnt;
	pkt->pktsched_plen = bytes;

	switch (cpkt->cp_ptype) {
	case QP_MBUF:
		break;

#if SKYWALK
	case QP_PACKET:
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
333 
/*
 * Duplicate the single packet held by 'pkt1' into 'pkt2' (deep copy
 * of the underlying mbuf or skywalk packet).  In-place cloning
 * (pkt1 == pkt2) is allowed; otherwise pkt2 must not already hold a
 * packet so nothing is leaked by overwriting it.
 *
 * Returns 0 on success; ENOBUFS (mbuf) or the clone error (skywalk)
 * on failure, in which case pkt2 is left unmodified.
 */
int
pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
{
	struct mbuf *m1, *m2;
#if SKYWALK
	struct __kern_packet *p1;
	kern_packet_t ph2;
	int err;
#endif /* SKYWALK */

	ASSERT(pkt1 != NULL);
	ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
	ASSERT(pkt1->pktsched_pcnt == 1);

	/* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
	ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
	    pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
	    pkt2->pktsched_pkt_mbuf == NULL));

	switch (pkt1->pktsched_ptype) {
	case QP_MBUF:
		m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
		m2 = m_dup(m1, M_NOWAIT);
		if (__improbable(m2 == NULL)) {
			return ENOBUFS;
		}
		pkt2->pktsched_pkt_mbuf = m2;
		break;

#if SKYWALK
	case QP_PACKET:
		p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
		/* KPKT_COPY_HEAVY: deep copy including buffer contents */
		err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
		    METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
		    KPKT_COPY_HEAVY);
		if (__improbable(err != 0)) {
			return err;
		}
		ASSERT(ph2 != 0);
		VERIFY(kern_packet_finalize(ph2) == 0);
		pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* clone is a single packet: tail == head, count 1 */
	pkt2->pktsched_plen = pkt1->pktsched_plen;
	pkt2->pktsched_ptype = pkt1->pktsched_ptype;
	pkt2->pktsched_tail = pkt2->pktsched_pkt;
	pkt2->pktsched_pcnt = 1;
	return 0;
}
390 
391 void
pktsched_corrupt_packet(pktsched_pkt_t * pkt)392 pktsched_corrupt_packet(pktsched_pkt_t *pkt)
393 {
394 	struct mbuf *m = NULL;
395 	uint8_t *data = NULL;
396 	uint32_t data_len = 0;
397 	uint32_t rand32, rand_off, rand_bit;
398 #if SKYWALK
399 	struct __kern_packet *p = NULL;
400 #endif /* SKYWALK */
401 
402 	switch (pkt->pktsched_ptype) {
403 	case QP_MBUF:
404 		m = pkt->pktsched_pkt_mbuf;
405 		data = mtod(m, uint8_t *);
406 		data_len = m->m_pkthdr.len;
407 		break;
408 #if SKYWALK
409 	case QP_PACKET:
410 		p = pkt->pktsched_pkt_kpkt;
411 		if (p->pkt_pflags & PKT_F_MBUF_DATA) {
412 			m = p->pkt_mbuf;
413 			data = mtod(m, uint8_t *);
414 			data_len = m->m_pkthdr.len;
415 		} else {
416 			MD_BUFLET_ADDR_DLEN(p, data, data_len);
417 		}
418 		break;
419 #endif /* SKYWALK */
420 
421 	default:
422 		/* NOTREACHED */
423 		VERIFY(0);
424 		__builtin_unreachable();
425 	}
426 
427 	read_frandom(&rand32, sizeof(rand32));
428 	rand_bit = rand32 & 0x7;
429 	rand_off = (rand32 >> 3) % data_len;
430 	data[rand_off] ^= (uint8_t)(1 << rand_bit);
431 }
432 
/*
 * Free all packets held by 'pkt' (single packet or chain) and reset
 * the descriptor to an empty state so stale pointers cannot be
 * reused.
 */
void
pktsched_free_pkt(pktsched_pkt_t *pkt)
{
	uint32_t cnt = pkt->pktsched_pcnt;
	ASSERT(cnt != 0);

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m;

		m = pkt->pktsched_pkt_mbuf;
		/* sanity: chain linkage must agree with the recorded count */
		if (cnt == 1) {
			VERIFY(m->m_nextpkt == NULL);
		} else {
			VERIFY(m->m_nextpkt != NULL);
		}
		m_freem_list(m);
		break;
	}
#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kpkt;
		int pcnt = 0;

		kpkt = pkt->pktsched_pkt_kpkt;
		if (cnt == 1) {
			VERIFY(kpkt->pkt_nextpkt == NULL);
		} else {
			VERIFY(kpkt->pkt_nextpkt != NULL);
		}
		pp_free_packet_chain(kpkt, &pcnt);
		/* the pool reports how many packets it actually freed */
		VERIFY(cnt == (uint32_t)pcnt);
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
	pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
	pkt->pktsched_plen = 0;
	pkt->pktsched_pcnt = 0;
}
479 
/*
 * Drop all packets held by 'pkt', reporting the drop (reason, call
 * site, outbound direction) to droptap when any drop taps are
 * attached.  With no taps attached this degenerates to a plain free.
 */
void
pktsched_drop_pkt(pktsched_pkt_t *pkt, struct ifnet *ifp, drop_reason_t reason, const char *funcname,
    uint16_t linenum, uint16_t flags)
{
	/* fast path: no taps, nothing to report */
	if (__probable(droptap_total_tap_count == 0)) {
		pktsched_free_pkt(pkt);
		return;
	}

	uint32_t cnt = pkt->pktsched_pcnt;
	ASSERT(cnt != 0);

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m;

		m = pkt->pktsched_pkt_mbuf;
		if (cnt == 1) {
			VERIFY(m->m_nextpkt == NULL);
		} else {
			VERIFY(m->m_nextpkt != NULL);
		}
		/* m_drop_list both reports to droptap and frees the chain */
		m_drop_list(m, ifp, flags | DROPTAP_FLAG_DIR_OUT, reason, funcname, linenum);
		break;
	}
#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kpkt;

		kpkt = pkt->pktsched_pkt_kpkt;
		if (cnt == 1) {
			VERIFY(kpkt->pkt_nextpkt == NULL);
		} else {
			VERIFY(kpkt->pkt_nextpkt != NULL);
		}
		/*
		 * NOTE(review): only the head packet is reported to droptap
		 * here; the rest of the chain is freed below unreported —
		 * confirm this is intended for multi-packet drops.
		 */
		droptap_output_packet(SK_PKT2PH(kpkt), reason, funcname, linenum,
		    flags, ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
		pktsched_free_pkt(pkt);
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
528 
529 mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t * pkt)530 pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
531 {
532 	mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
533 
534 	switch (pkt->pktsched_ptype) {
535 	case QP_MBUF:
536 		svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
537 		break;
538 
539 #if SKYWALK
540 	case QP_PACKET:
541 		svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
542 		break;
543 #endif /* SKYWALK */
544 
545 	default:
546 		VERIFY(0);
547 		/* NOTREACHED */
548 		__builtin_unreachable();
549 	}
550 
551 	return svc;
552 }
553 
/*
 * Extract commonly-used per-packet metadata, abstracting over the
 * mbuf and skywalk packet representations.  Any output parameter may
 * be NULL if the caller does not need it.  'flags' and 'timestamp'
 * are returned as pointers into the packet metadata so the caller can
 * update them in place; the remaining outputs are copied by value.
 */
void
pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
    uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
    uint32_t *comp_gencnt, uint64_t *pkt_tx_time)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		if (flags != NULL) {
			*flags = &pkth->pkt_flags;
		}
		if (timestamp != NULL) {
			*timestamp = &pkth->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = pkth->pkt_flowid;
		}
		if (flowsrc != NULL) {
			*flowsrc = pkth->pkt_flowsrc;
		}
		if (proto != NULL) {
			/*
			 * rdar://100524205 - We want to use the pkt_ext_flags
			 * to denote QUIC packets, but AQM is already written in
			 * such a way where IPPROTO_QUIC is used to denote QUIC
			 * packets.
			 */
			if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) {
				*proto = IPPROTO_QUIC;
			} else {
				*proto = pkth->pkt_proto;
			}
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = pkth->comp_gencnt;
		}
		if (pkt_tx_time != NULL) {
			/* tx time, if set, rides in an AQM m_tag (see m_tag_kalloc_aqm) */
			struct m_tag *tag;
			tag = m_tag_locate(pkt->pktsched_pkt_mbuf, KERNEL_MODULE_TAG_ID,
			    KERNEL_TAG_TYPE_AQM);
			if (__improbable(tag != NULL)) {
				*pkt_tx_time = *(uint64_t *)tag->m_tag_data;
			} else {
				*pkt_tx_time = 0;
			}
		}

		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		if (flags != NULL) {
			/* use lower-32 bit for common flags */
			*flags = &kp->pkt_pflags32;
		}
		if (timestamp != NULL) {
			*timestamp = &kp->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = kp->pkt_flow_token;
		}
		if (flowsrc != NULL) {
			*flowsrc = (uint8_t)kp->pkt_flowsrc_type;
		}
		if (proto != NULL) {
			*proto = kp->pkt_transport_protocol;
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = kp->pkt_comp_gencnt;
		}
		if (pkt_tx_time != NULL) {
			*pkt_tx_time = __packet_get_tx_timestamp(SK_PKT2PH(kp));
		}

		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
642 
/*
 * Allocate and populate a flow-advisory entry from the packet's flow
 * metadata, used to signal the flow-controlled source when the queue
 * later drains.  'how' is the allocation wait flag passed through to
 * flowadv_alloc_entry().  Returns NULL on allocation failure.
 */
struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
{
#pragma unused(ifp)
	struct flowadv_fcentry *fce = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m = pkt->pktsched_pkt_mbuf;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(fce->fce_flowid));

		fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		fce->fce_flowid = m->m_pkthdr.pkt_flowid;
#if SKYWALK
		static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(fce->fce_flowsrc_token));
		static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(fce->fce_flowsrc_fidx));

		/* channel-sourced mbufs carry their flow identity in mpriv fields */
		if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
			fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
			fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
			fce->fce_ifp = ifp;
		}
#endif /* SKYWALK */
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		static_assert(sizeof(fce->fce_flowid) == sizeof(kp->pkt_flow_token));
		static_assert(sizeof(fce->fce_flowsrc_fidx) == sizeof(kp->pkt_flowsrc_fidx));
		static_assert(sizeof(fce->fce_flowsrc_token) == sizeof(kp->pkt_flowsrc_token));

		ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
		fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
		fce->fce_flowid = kp->pkt_flow_token;
		fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
		fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
		fce->fce_ifp = ifp;
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return fce;
}
706 
/*
 * Set the CE (Congestion Experienced) codepoint on an ECN-capable
 * mbuf packet (RFC 3168).  For IPv4 the header checksum is updated
 * incrementally per RFC 1624, unless hardware/delayed checksum will
 * recompute it anyway.
 *
 * Returns 0 if marked (or already CE-marked); EINVAL if the packet is
 * not ECN-capable or the IP header lies outside the mbuf data;
 * EPROTONOSUPPORT for non-IP packets.
 */
static int
pktsched_mbuf_mark_ecn(struct mbuf* m)
{
	struct mbuf     *m0;
	void            *__single hdr;
	int             af;
	uint8_t         ipv;

	hdr = m->m_pkthdr.pkt_hdr;
	/* verify that hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (((caddr_t)hdr >= m_mtod_current(m0)) &&
		    ((caddr_t)hdr < m_mtod_current(m0) + m0->m_len)) {
			break;
		}
	}
	if (m0 == NULL) {
		return EINVAL;
	}
	/* first nibble of the IP header is the version */
	ipv = IP_VHL_V(*(uint8_t *)hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}

	switch (af) {
	case AF_INET: {
		struct ip *__single ip = (struct ip *)(void *)hdr;
		uint8_t otos;
		int sum;

		if (((uintptr_t)ip + sizeof(*ip)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;
		/*
		 * update checksum (from RFC1624) only if hw
		 * checksum is not supported.
		 *         HC' = ~(~HC + ~m + m')
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16); /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
		}
		return 0;
	}
	case AF_INET6: {
		struct ip6_hdr *__single ip6 = (struct ip6_hdr *)(void *)hdr;
		u_int32_t flowlabel;

		if (((uintptr_t)ip6 + sizeof(*ip6)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		/* ECN bits live at bits 20-21 of the v6 flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,  mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);
		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
799 
/*
 * Set the CE (Congestion Experienced) codepoint on an ECN-capable
 * skywalk packet (RFC 3168).  The L3 header is located either via the
 * flow classifier metadata (when the packet was classified) or by
 * walking past the L2 header in the buflet.  IPv4 checksums are
 * always updated incrementally per RFC 1624.
 *
 * Returns 0 if marked (or already CE-marked); EINVAL if the packet is
 * not ECN-capable; EPROTONOSUPPORT for non-IP packets.
 */
static int
pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
{
	uint8_t ipv = 0, *l3_hdr;

	if ((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0) {
		/* classifier already located the IP header for us */
		uint32_t l3_len = 0;
		ipv = kpkt->pkt_flow_ip_ver;
		l3_len = kpkt->pkt_length - kpkt->pkt_l2_len;
		l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *, kpkt->pkt_flow_ip_hdr, l3_len);
	} else {
		uint8_t *pkt_buf;
		uint32_t bdlen, bdlim, bdoff;
		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);

		/* takes care of both IPv4 and IPv6 */
		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
		/* normalize the raw version nibble to the flow-metadata encoding */
		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
		if (ipv == 4) {
			ipv = IPVERSION;
		} else if (ipv == 6) {
			ipv = IPV6_VERSION;
		} else {
			ipv = 0;
		}
	}

	switch (ipv) {
	case IPVERSION: {
		uint8_t otos;
		int sum;

		struct ip *ip = (struct ip *)(void *)l3_hdr;
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;

		/* incremental checksum update, RFC 1624: HC' = ~(~HC + ~m + m') */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += (~otos & 0xffff) + ip->ip_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16); /* add carry */
		ip->ip_sum = htons(~sum & 0xffff);

		return 0;
	}
	case IPV6_VERSION: {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)(void *)l3_hdr;
		u_int32_t flowlabel;
		/* ECN bits live at bits 20-21 of the v6 flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);

		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
878 
879 int
pktsched_mark_ecn(pktsched_pkt_t * pkt)880 pktsched_mark_ecn(pktsched_pkt_t *pkt)
881 {
882 	switch (pkt->pktsched_ptype) {
883 	case QP_MBUF:
884 		return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
885 	case QP_PACKET:
886 		return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
887 	default:
888 		VERIFY(0);
889 		/* NOTREACHED */
890 		__builtin_unreachable();
891 	}
892 }
893 
894 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)895 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
896 {
897 	switch (pkt->pktsched_ptype) {
898 	case QP_MBUF: {
899 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
900 		return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
901 	}
902 	case QP_PACKET: {
903 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
904 		return (kp->pkt_pflags & PKT_F_L4S) != 0;
905 	}
906 
907 	default:
908 		VERIFY(0);
909 		/* NOTREACHED */
910 		__builtin_unreachable();
911 	}
912 	return FALSE;
913 }
914 
/*
 * Co-allocates an m_tag header with its uint64_t AQM payload in one
 * allocation.  The m_tag must remain the first member so the tag
 * pointer can be cast back to the container when freeing.
 */
struct aqm_tag_container {
	struct m_tag            aqm_m_tag;  /* embedded tag header (must be first) */
	uint64_t                aqm_tag;    /* payload: AQM tx timestamp */
};
919 
920 static struct  m_tag *
m_tag_kalloc_aqm(u_int32_t id,u_int16_t type,uint16_t len,int wait)921 m_tag_kalloc_aqm(u_int32_t id, u_int16_t type, uint16_t len, int wait)
922 {
923 	struct aqm_tag_container *tag_container;
924 	struct m_tag *tag = NULL;
925 
926 	assert3u(id, ==, KERNEL_MODULE_TAG_ID);
927 	assert3u(type, ==, KERNEL_TAG_TYPE_AQM);
928 	assert3u(len, ==, sizeof(uint64_t));
929 
930 	if (len != sizeof(uint64_t)) {
931 		return NULL;
932 	}
933 
934 	tag_container = kalloc_type(struct aqm_tag_container, wait | M_ZERO);
935 	if (tag_container != NULL) {
936 		tag = &tag_container->aqm_m_tag;
937 
938 		assert3p(tag, ==, tag_container);
939 
940 		M_TAG_INIT(tag, id, type, len, &tag_container->aqm_tag, NULL);
941 	}
942 
943 	return tag;
944 }
945 
946 static void
m_tag_kfree_aqm(struct m_tag * tag)947 m_tag_kfree_aqm(struct m_tag *tag)
948 {
949 	struct aqm_tag_container *__single tag_container = (struct aqm_tag_container *)tag;
950 
951 	assert3u(tag->m_tag_len, ==, sizeof(uint64_t));
952 
953 	kfree_type(struct aqm_tag_container, tag_container);
954 }
955 
/*
 * Register the AQM m_tag type with the mbuf tag subsystem, wiring in
 * the custom alloc/free routines above so AQM tx timestamps can be
 * attached to mbufs.  Registration failure is a programming error.
 */
void
pktsched_register_m_tag(void)
{
	int error;

	error = m_register_internal_tag_type(KERNEL_TAG_TYPE_AQM, sizeof(uint64_t),
	    m_tag_kalloc_aqm, m_tag_kfree_aqm);

	assert3u(error, ==, 0);
}
966