xref: /xnu-11417.121.6/bsd/net/pktsched/pktsched.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/cdefs.h>
30 
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39 
40 #include <dev/random/randomdev.h>
41 #include <net/droptap.h>
42 #include <net/if.h>
43 #include <net/if_var.h>
44 #include <net/if_dl.h>
45 #include <net/if_types.h>
46 #include <net/net_osdep.h>
47 #include <net/droptap.h>
48 #include <net/pktsched/pktsched.h>
49 #include <net/pktsched/pktsched_fq_codel.h>
50 #include <net/pktsched/pktsched_netem.h>
51 
52 #define _IP_VHL
53 #include <netinet/ip.h>
54 #include <netinet/ip6.h>
55 
56 #include <pexpert/pexpert.h>
57 
58 #if SKYWALK
59 #include <skywalk/os_skywalk_private.h>
60 #endif /* SKYWALK */
61 
/* Machine clock frequency in ticks per second; set by init_machclk(). */
u_int32_t machclk_freq = 0;
/* Number of mach absolute-time units per second; set by init_machclk(). */
u_int64_t machclk_per_sec = 0;
u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */

static void init_machclk(void);

/* sysctl subtree: net.pktsched */
SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");

/* net.pktsched.verbose: runtime-tunable scheduler logging verbosity */
SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
    &pktsched_verbose, 0, "Packet scheduler verbosity level");
72 
/*
 * One-time initialization of the packet scheduler subsystem: derive the
 * machine clock frequency, then initialize the FQ-CoDel scheduler globals.
 * Panics if no usable CPU timebase is available, since all scheduler
 * timing depends on it.
 */
void
pktsched_init(void)
{
	init_machclk();
	if (machclk_freq == 0) {
		panic("%s: no CPU clock available!", __func__);
		/* NOTREACHED */
	}
	pktsched_fq_init();
}
83 
/*
 * Populate machclk_freq from the platform timebase frequency and compute
 * machclk_per_sec, the length of one second in mach absolute-time units.
 */
static void
init_machclk(void)
{
	/*
	 * Initialize machclk_freq using the timerbase frequency
	 * value from device specific info.
	 */
	machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;

	/* one second (1 * NSEC_PER_SEC ns) expressed as an abstime interval */
	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
	    &machclk_per_sec);
}
96 
97 u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)98 pktsched_abs_to_nsecs(u_int64_t abstime)
99 {
100 	u_int64_t nsecs;
101 
102 	absolutetime_to_nanoseconds(abstime, &nsecs);
103 	return nsecs;
104 }
105 
106 u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)107 pktsched_nsecs_to_abstime(u_int64_t nsecs)
108 {
109 	u_int64_t abstime;
110 
111 	nanoseconds_to_absolutetime(nsecs, &abstime);
112 	return abstime;
113 }
114 
115 int
pktsched_setup(struct ifclassq * ifq,u_int32_t scheduler,u_int32_t sflags,classq_pkt_type_t ptype)116 pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags,
117     classq_pkt_type_t ptype)
118 {
119 	int error = 0;
120 	u_int32_t rflags;
121 
122 	IFCQ_LOCK_ASSERT_HELD(ifq);
123 
124 	VERIFY(machclk_freq != 0);
125 
126 	/* Nothing to do unless the scheduler type changes */
127 	if (ifq->ifcq_type == scheduler) {
128 		return 0;
129 	}
130 
131 	/*
132 	 * Remember the flags that need to be restored upon success, as
133 	 * they may be cleared when we tear down existing scheduler.
134 	 */
135 	rflags = (ifq->ifcq_flags & IFCQF_ENABLED);
136 
137 	if (ifq->ifcq_type != PKTSCHEDT_NONE) {
138 		pktsched_teardown(ifq);
139 
140 		/* Teardown should have succeeded */
141 		VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
142 		VERIFY(ifq->ifcq_disc == NULL);
143 	}
144 
145 	error = fq_if_setup_ifclassq(ifq, sflags, ptype);
146 	if (error == 0) {
147 		ifq->ifcq_flags |= rflags;
148 	}
149 
150 	return error;
151 }
152 
153 void
pktsched_teardown(struct ifclassq * ifq)154 pktsched_teardown(struct ifclassq *ifq)
155 {
156 	IFCQ_LOCK_ASSERT_HELD(ifq);
157 	if_qflush(ifq->ifcq_ifp, ifq, true);
158 	VERIFY(IFCQ_IS_EMPTY(ifq));
159 	ifq->ifcq_flags &= ~IFCQF_ENABLED;
160 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
161 		/* Could be PKTSCHEDT_NONE */
162 		fq_if_teardown_ifclassq(ifq);
163 	}
164 	return;
165 }
166 
167 int
pktsched_getqstats(struct ifclassq * ifq,u_int32_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)168 pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
169     struct if_ifclassq_stats *ifqs)
170 {
171 	int error = 0;
172 
173 	IFCQ_LOCK_ASSERT_HELD(ifq);
174 
175 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
176 		/* Could be PKTSCHEDT_NONE */
177 		error = fq_if_getqstats_ifclassq(ifq, (uint8_t)gid, qid, ifqs);
178 	}
179 
180 	return error;
181 }
182 
/*
 * Wrap a single classq packet into a pktsched_pkt_t: head and tail both
 * reference the one packet, the count is 1, and the byte length is read
 * from the underlying mbuf or skywalk packet.
 */
void
pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
{
	pkt->pktsched_pkt = *cpkt;
	pkt->pktsched_tail = *cpkt;
	pkt->pktsched_pcnt = 1;

	switch (cpkt->cp_ptype) {
	case QP_MBUF:
		pkt->pktsched_plen =
		    (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
		break;

#if SKYWALK
	case QP_PACKET:
		pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
208 
/*
 * Wrap a chain of classq packets into a pktsched_pkt_t.  The caller
 * supplies the head, tail, packet count and total byte count; the switch
 * only validates that the packet type is one we understand.
 */
void
pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
{
	pkt->pktsched_pkt = *cpkt;
	pkt->pktsched_tail = *tail;
	pkt->pktsched_pcnt = cnt;
	pkt->pktsched_plen = bytes;

	switch (cpkt->cp_ptype) {
	case QP_MBUF:
		break;

#if SKYWALK
	case QP_PACKET:
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
233 
/*
 * Deep-copy the single packet in pkt1 into pkt2.  In-place cloning
 * (pkt1 == pkt2) is allowed; otherwise pkt2 must not already hold a
 * packet, or it would leak.  Returns 0 on success, ENOBUFS when an mbuf
 * duplicate cannot be allocated, or the skywalk clone error.
 */
int
pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
{
	struct mbuf *m1, *m2;
#if SKYWALK
	struct __kern_packet *p1;
	kern_packet_t ph2;
	int err;
#endif /* SKYWALK */

	ASSERT(pkt1 != NULL);
	ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
	/* only single packets can be cloned, not chains */
	ASSERT(pkt1->pktsched_pcnt == 1);

	/* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
	ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
	    pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
	    pkt2->pktsched_pkt_mbuf == NULL));

	switch (pkt1->pktsched_ptype) {
	case QP_MBUF:
		m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
		m2 = m_dup(m1, M_NOWAIT);
		if (__improbable(m2 == NULL)) {
			return ENOBUFS;
		}
		pkt2->pktsched_pkt_mbuf = m2;
		break;

#if SKYWALK
	case QP_PACKET:
		p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
		/* heavy copy: duplicates the packet data, not just metadata */
		err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
		    METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
		    KPKT_COPY_HEAVY);
		if (__improbable(err != 0)) {
			return err;
		}
		ASSERT(ph2 != 0);
		VERIFY(kern_packet_finalize(ph2) == 0);
		pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mirror the remaining bookkeeping fields of the source packet */
	pkt2->pktsched_plen = pkt1->pktsched_plen;
	pkt2->pktsched_ptype = pkt1->pktsched_ptype;
	pkt2->pktsched_tail = pkt2->pktsched_pkt;
	pkt2->pktsched_pcnt = 1;
	return 0;
}
290 
291 void
pktsched_corrupt_packet(pktsched_pkt_t * pkt)292 pktsched_corrupt_packet(pktsched_pkt_t *pkt)
293 {
294 	struct mbuf *m = NULL;
295 	uint8_t *data = NULL;
296 	uint32_t data_len = 0;
297 	uint32_t rand32, rand_off, rand_bit;
298 #if SKYWALK
299 	struct __kern_packet *p = NULL;
300 #endif /* SKYWALK */
301 
302 	switch (pkt->pktsched_ptype) {
303 	case QP_MBUF:
304 		m = pkt->pktsched_pkt_mbuf;
305 		data = mtod(m, uint8_t *);
306 		data_len = m->m_pkthdr.len;
307 		break;
308 #if SKYWALK
309 	case QP_PACKET:
310 		p = pkt->pktsched_pkt_kpkt;
311 		if (p->pkt_pflags & PKT_F_MBUF_DATA) {
312 			m = p->pkt_mbuf;
313 			data = mtod(m, uint8_t *);
314 			data_len = m->m_pkthdr.len;
315 		} else {
316 			MD_BUFLET_ADDR_DLEN(p, data, data_len);
317 		}
318 		break;
319 #endif /* SKYWALK */
320 
321 	default:
322 		/* NOTREACHED */
323 		VERIFY(0);
324 		__builtin_unreachable();
325 	}
326 
327 	read_frandom(&rand32, sizeof(rand32));
328 	rand_bit = rand32 & 0x7;
329 	rand_off = (rand32 >> 3) % data_len;
330 	data[rand_off] ^= (uint8_t)(1 << rand_bit);
331 }
332 
/*
 * Free the packet (or packet chain) referenced by pkt and reset pkt's
 * fields so it no longer points at the freed memory.  The chain-link
 * VERIFYs cross-check the recorded packet count against the actual
 * next-pointer state.
 */
void
pktsched_free_pkt(pktsched_pkt_t *pkt)
{
	uint32_t cnt = pkt->pktsched_pcnt;
	ASSERT(cnt != 0);

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m;

		m = pkt->pktsched_pkt_mbuf;
		if (cnt == 1) {
			VERIFY(m->m_nextpkt == NULL);
		} else {
			VERIFY(m->m_nextpkt != NULL);
		}
		m_freem_list(m);
		break;
	}
#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kpkt;
		int pcnt = 0;

		kpkt = pkt->pktsched_pkt_kpkt;
		if (cnt == 1) {
			VERIFY(kpkt->pkt_nextpkt == NULL);
		} else {
			VERIFY(kpkt->pkt_nextpkt != NULL);
		}
		pp_free_packet_chain(kpkt, &pcnt);
		/* the pool must have freed exactly the packets we tracked */
		VERIFY(cnt == (uint32_t)pcnt);
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* wipe references to the freed chain */
	pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
	pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
	pkt->pktsched_plen = 0;
	pkt->pktsched_pcnt = 0;
}
379 
380 void
pktsched_drop_pkt(pktsched_pkt_t * pkt,struct ifnet * ifp,drop_reason_t reason,const char * funcname,uint16_t linenum,uint16_t flags)381 pktsched_drop_pkt(pktsched_pkt_t *pkt, struct ifnet *ifp, drop_reason_t reason, const char *funcname,
382     uint16_t linenum, uint16_t flags)
383 {
384 	if (__probable(droptap_total_tap_count == 0)) {
385 		pktsched_free_pkt(pkt);
386 		return;
387 	}
388 
389 	uint32_t cnt = pkt->pktsched_pcnt;
390 	ASSERT(cnt != 0);
391 
392 	switch (pkt->pktsched_ptype) {
393 	case QP_MBUF: {
394 		struct mbuf *m;
395 
396 		m = pkt->pktsched_pkt_mbuf;
397 		if (cnt == 1) {
398 			VERIFY(m->m_nextpkt == NULL);
399 		} else {
400 			VERIFY(m->m_nextpkt != NULL);
401 		}
402 		m_drop_list(m, ifp, flags | DROPTAP_FLAG_DIR_OUT, reason, funcname, linenum);
403 		break;
404 	}
405 #if SKYWALK
406 	case QP_PACKET: {
407 		struct __kern_packet *kpkt;
408 
409 		kpkt = pkt->pktsched_pkt_kpkt;
410 		if (cnt == 1) {
411 			VERIFY(kpkt->pkt_nextpkt == NULL);
412 		} else {
413 			VERIFY(kpkt->pkt_nextpkt != NULL);
414 		}
415 		droptap_output_packet(SK_PKT2PH(kpkt), reason, funcname, linenum,
416 		    flags, ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
417 		break;
418 	}
419 #endif /* SKYWALK */
420 
421 	default:
422 		VERIFY(0);
423 		/* NOTREACHED */
424 		__builtin_unreachable();
425 	}
426 
427 	pktsched_free_pkt(pkt);
428 }
429 
430 mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t * pkt)431 pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
432 {
433 	mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
434 
435 	switch (pkt->pktsched_ptype) {
436 	case QP_MBUF:
437 		svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
438 		break;
439 
440 #if SKYWALK
441 	case QP_PACKET:
442 		svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
443 		break;
444 #endif /* SKYWALK */
445 
446 	default:
447 		VERIFY(0);
448 		/* NOTREACHED */
449 		__builtin_unreachable();
450 	}
451 
452 	return svc;
453 }
454 
/*
 * Retrieve assorted per-packet variables used by the scheduler.  Every
 * out-parameter is optional (NULL means "not wanted").  Note that flags
 * and timestamp are returned as pointers INTO the packet metadata so the
 * caller may modify them in place; the remaining values are copied out.
 */
void
pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
    uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
    uint32_t *comp_gencnt, uint64_t *pkt_tx_time)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		if (flags != NULL) {
			*flags = &pkth->pkt_flags;
		}
		if (timestamp != NULL) {
			*timestamp = &pkth->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = pkth->pkt_flowid;
		}
		if (flowsrc != NULL) {
			*flowsrc = pkth->pkt_flowsrc;
		}
		if (proto != NULL) {
			/*
			 * rdar://100524205 - We want to use the pkt_ext_flags
			 * to denote QUIC packets, but AQM is already written in
			 * such a way where IPPROTO_QUIC is used to denote QUIC
			 * packets.
			 */
			if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) {
				*proto = IPPROTO_QUIC;
			} else {
				*proto = pkth->pkt_proto;
			}
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = pkth->comp_gencnt;
		}
		if (pkt_tx_time != NULL) {
			struct m_tag *tag;
			/* tx time rides in an AQM m_tag; absent means 0 */
			tag = m_tag_locate(pkt->pktsched_pkt_mbuf, KERNEL_MODULE_TAG_ID,
			    KERNEL_TAG_TYPE_AQM);
			if (__improbable(tag != NULL)) {
				*pkt_tx_time = *(uint64_t *)tag->m_tag_data;
			} else {
				*pkt_tx_time = 0;
			}
		}

		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		if (flags != NULL) {
			/* use lower-32 bit for common flags */
			*flags = &kp->pkt_pflags32;
		}
		if (timestamp != NULL) {
			*timestamp = &kp->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = kp->pkt_flow_token;
		}
		if (flowsrc != NULL) {
			*flowsrc = (uint8_t)kp->pkt_flowsrc_type;
		}
		if (proto != NULL) {
			*proto = kp->pkt_transport_protocol;
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = kp->pkt_comp_gencnt;
		}
		if (pkt_tx_time != NULL) {
			*pkt_tx_time = __packet_get_tx_timestamp(SK_PKT2PH(kp));
		}

		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
543 
/*
 * Allocate and populate a flow-advisory entry from the packet's flow
 * identification fields, for flow-control feedback to the source.
 * Returns NULL if the entry cannot be allocated (per `how`).
 */
struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
{
#pragma unused(ifp)
	struct flowadv_fcentry *fce = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m = pkt->pktsched_pkt_mbuf;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
		    sizeof(fce->fce_flowid));

		fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		fce->fce_flowid = m->m_pkthdr.pkt_flowid;
#if SKYWALK
		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
		    sizeof(fce->fce_flowsrc_token));
		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
		    sizeof(fce->fce_flowsrc_fidx));

		/* mbufs from a skywalk channel carry extra flow identity */
		if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
			fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
			fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
			fce->fce_ifp = ifp;
		}
#endif /* SKYWALK */
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		_CASSERT(sizeof(fce->fce_flowid) ==
		    sizeof(kp->pkt_flow_token));
		_CASSERT(sizeof(fce->fce_flowsrc_fidx) ==
		    sizeof(kp->pkt_flowsrc_fidx));
		_CASSERT(sizeof(fce->fce_flowsrc_token) ==
		    sizeof(kp->pkt_flowsrc_token));

		/* only packets opted into flow advisory should get here */
		ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
		fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
		fce->fce_flowid = kp->pkt_flow_token;
		fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
		fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
		fce->fce_ifp = ifp;
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return fce;
}
613 
/*
 * Return pointers into the packet metadata used by the SFB classifier:
 * the function result is the hash slot and *sfb_flags is the flags slot.
 * Both point at live packet fields, so the caller may update them in
 * place.
 */
uint32_t *
pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
{
	uint32_t *hashp = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		/* SFB reuses the mbuf private hash/flags fields */
		_CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
		_CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
		*sfb_flags = &pkth->pkt_mpriv_flags;
		hashp = &pkth->pkt_mpriv_hash;
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		_CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t));
		_CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t));
		*sfb_flags = &kp->pkt_classq_flags;
		hashp = &kp->pkt_classq_hash;
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return hashp;
}
650 
/*
 * Set the ECN Congestion Experienced (CE) codepoint on an mbuf's IP or
 * IPv6 header (RFC 3168).  Returns 0 if marked or already marked, EINVAL
 * if the packet is not ECN-capable or the header cannot be validated,
 * and EPROTONOSUPPORT for non-IP packets.  The IPv4 checksum is patched
 * incrementally (RFC 1624) unless hardware will recompute it.
 */
static int
pktsched_mbuf_mark_ecn(struct mbuf* m)
{
	struct mbuf     *m0;
	void            *__single hdr;
	int             af;
	uint8_t         ipv;

	hdr = m->m_pkthdr.pkt_hdr;
	/* verify that hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (((caddr_t)hdr >= m_mtod_current(m0)) &&
		    ((caddr_t)hdr < m_mtod_current(m0) + m0->m_len)) {
			break;
		}
	}
	if (m0 == NULL) {
		return EINVAL;
	}
	/* first nibble of the header distinguishes IPv4 from IPv6 */
	ipv = IP_VHL_V(*(uint8_t *)hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}

	switch (af) {
	case AF_INET: {
		struct ip *__single ip = (struct ip *)(void *)hdr;
		uint8_t otos;
		int sum;

		if (((uintptr_t)ip + sizeof(*ip)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;
		/*
		 * update checksum (from RFC1624) only if hw
		 * checksum is not supported.
		 *         HC' = ~(~HC + ~m + m')
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16); /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
		}
		return 0;
	}
	case AF_INET6: {
		struct ip6_hdr *__single ip6 = (struct ip6_hdr *)(void *)hdr;
		u_int32_t flowlabel;

		if (((uintptr_t)ip6 + sizeof(*ip6)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		/* ECN bits live at bits 20-21 of the v6 flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,  mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);
		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
743 
/*
 * Set the ECN CE codepoint on a skywalk packet's IP or IPv6 header.
 * The L3 header is located either via the cached flow classification or
 * by walking past the L2 header in the buflet.  Return values match
 * pktsched_mbuf_mark_ecn(): 0 on success/already-marked, EINVAL for
 * not-ECT, EPROTONOSUPPORT for non-IP.  The IPv4 checksum is always
 * patched incrementally here (RFC 1624).
 */
static int
pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
{
	uint8_t ipv = 0, *l3_hdr;

	if ((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0) {
		/* classifier already recorded IP version and header location */
		uint32_t l3_len = 0;
		ipv = kpkt->pkt_flow_ip_ver;
		l3_len = kpkt->pkt_length - kpkt->pkt_l2_len;
		l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *, kpkt->pkt_flow_ip_hdr, l3_len);
	} else {
		uint8_t *pkt_buf;
		uint32_t bdlen, bdlim, bdoff;
		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);

		/* takes care of both IPv4 and IPv6 */
		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
		/* normalize the first-nibble version to the header constants */
		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
		if (ipv == 4) {
			ipv = IPVERSION;
		} else if (ipv == 6) {
			ipv = IPV6_VERSION;
		} else {
			ipv = 0;
		}
	}

	switch (ipv) {
	case IPVERSION: {
		uint8_t otos;
		int sum;

		struct ip *ip = (struct ip *)(void *)l3_hdr;
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;

		/* incremental checksum update, RFC 1624: HC' = ~(~HC + ~m + m') */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += (~otos & 0xffff) + ip->ip_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16); /* add carry */
		ip->ip_sum = htons(~sum & 0xffff);

		return 0;
	}
	case IPV6_VERSION: {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)(void *)l3_hdr;
		u_int32_t flowlabel;
		/* ECN bits live at bits 20-21 of the v6 flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);

		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
822 
/*
 * Mark Congestion Experienced on the packet's IP header, dispatching on
 * packet type.  Returns 0 on success (or already marked), EINVAL if the
 * packet is not ECN-capable, EPROTONOSUPPORT for non-IP traffic.
 */
int
pktsched_mark_ecn(pktsched_pkt_t *pkt)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
	case QP_PACKET:
		return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
837 
838 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)839 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
840 {
841 	switch (pkt->pktsched_ptype) {
842 	case QP_MBUF: {
843 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
844 		return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
845 	}
846 	case QP_PACKET: {
847 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
848 		return (kp->pkt_pflags & PKT_F_L4S) != 0;
849 	}
850 
851 	default:
852 		VERIFY(0);
853 		/* NOTREACHED */
854 		__builtin_unreachable();
855 	}
856 	return FALSE;
857 }
858 
/*
 * Single-allocation container embedding the m_tag header together with
 * its 64-bit payload (the AQM tx time carried on mbufs).
 */
struct aqm_tag_container {
	struct m_tag            aqm_m_tag;	/* must be first: freed via cast */
	uint64_t                aqm_tag;	/* tag payload */
};
863 
/*
 * Allocator hook for KERNEL_TAG_TYPE_AQM m_tags.  Allocates the tag and
 * its 8-byte payload as one aqm_tag_container; returns NULL on failure
 * or if the requested length is not exactly sizeof(uint64_t).
 */
static struct  m_tag *
m_tag_kalloc_aqm(u_int32_t id, u_int16_t type, uint16_t len, int wait)
{
	struct aqm_tag_container *tag_container;
	struct m_tag *tag = NULL;

	assert3u(id, ==, KERNEL_MODULE_TAG_ID);
	assert3u(type, ==, KERNEL_TAG_TYPE_AQM);
	assert3u(len, ==, sizeof(uint64_t));

	if (len != sizeof(uint64_t)) {
		return NULL;
	}

	tag_container = kalloc_type(struct aqm_tag_container, wait | M_ZERO);
	if (tag_container != NULL) {
		tag = &tag_container->aqm_m_tag;

		/* the m_tag must be the first member for the free-side cast */
		assert3p(tag, ==, tag_container);

		M_TAG_INIT(tag, id, type, len, &tag_container->aqm_tag, NULL);
	}

	return tag;
}
889 
890 static void
m_tag_kfree_aqm(struct m_tag * tag)891 m_tag_kfree_aqm(struct m_tag *tag)
892 {
893 	struct aqm_tag_container *__single tag_container = (struct aqm_tag_container *)tag;
894 
895 	assert3u(tag->m_tag_len, ==, sizeof(uint64_t));
896 
897 	kfree_type(struct aqm_tag_container, tag_container);
898 }
899 
900 void
pktsched_register_m_tag(void)901 pktsched_register_m_tag(void)
902 {
903 	int error;
904 
905 	error = m_register_internal_tag_type(KERNEL_TAG_TYPE_AQM, sizeof(uint64_t),
906 	    m_tag_kalloc_aqm, m_tag_kfree_aqm);
907 
908 	assert3u(error, ==, 0);
909 }
910