xref: /xnu-10063.141.1/bsd/net/pktsched/pktsched.c (revision d8b80295118ef25ac3a784134bcf95cd8e88109f)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/cdefs.h>
30 
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39 
40 #include <dev/random/randomdev.h>
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_dl.h>
44 #include <net/if_types.h>
45 #include <net/net_osdep.h>
46 #include <net/pktsched/pktsched.h>
47 #include <net/pktsched/pktsched_fq_codel.h>
48 #include <net/pktsched/pktsched_netem.h>
49 
50 #define _IP_VHL
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 
54 #include <pexpert/pexpert.h>
55 
56 #if SKYWALK
57 #include <skywalk/os_skywalk_private.h>
58 #endif /* SKYWALK */
59 
60 u_int32_t machclk_freq = 0;
61 u_int64_t machclk_per_sec = 0;
62 u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */
63 
64 static void init_machclk(void);
65 
66 SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");
67 
68 SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
69     &pktsched_verbose, 0, "Packet scheduler verbosity level");
70 
71 void
pktsched_init(void)72 pktsched_init(void)
73 {
74 	init_machclk();
75 	if (machclk_freq == 0) {
76 		panic("%s: no CPU clock available!", __func__);
77 		/* NOTREACHED */
78 	}
79 	pktsched_fq_init();
80 }
81 
82 static void
init_machclk(void)83 init_machclk(void)
84 {
85 	/*
86 	 * Initialize machclk_freq using the timerbase frequency
87 	 * value from device specific info.
88 	 */
89 	machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;
90 
91 	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
92 	    &machclk_per_sec);
93 }
94 
/*
 * Convert an absolute-time value to nanoseconds.
 */
u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)
{
	u_int64_t result_ns;

	absolutetime_to_nanoseconds(abstime, &result_ns);

	return result_ns;
}
103 
/*
 * Convert a nanosecond count to an absolute-time value.
 */
u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)
{
	u_int64_t result_abs;

	nanoseconds_to_absolutetime(nsecs, &result_abs);

	return result_abs;
}
112 
113 int
pktsched_setup(struct ifclassq * ifq,u_int32_t scheduler,u_int32_t sflags,classq_pkt_type_t ptype)114 pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags,
115     classq_pkt_type_t ptype)
116 {
117 	int error = 0;
118 	u_int32_t rflags;
119 
120 	IFCQ_LOCK_ASSERT_HELD(ifq);
121 
122 	VERIFY(machclk_freq != 0);
123 
124 	/* Nothing to do unless the scheduler type changes */
125 	if (ifq->ifcq_type == scheduler) {
126 		return 0;
127 	}
128 
129 	/*
130 	 * Remember the flags that need to be restored upon success, as
131 	 * they may be cleared when we tear down existing scheduler.
132 	 */
133 	rflags = (ifq->ifcq_flags & IFCQF_ENABLED);
134 
135 	if (ifq->ifcq_type != PKTSCHEDT_NONE) {
136 		pktsched_teardown(ifq);
137 
138 		/* Teardown should have succeeded */
139 		VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
140 		VERIFY(ifq->ifcq_disc == NULL);
141 	}
142 
143 	error = fq_if_setup_ifclassq(ifq, sflags, ptype);
144 	if (error == 0) {
145 		ifq->ifcq_flags |= rflags;
146 	}
147 
148 	return error;
149 }
150 
151 void
pktsched_teardown(struct ifclassq * ifq)152 pktsched_teardown(struct ifclassq *ifq)
153 {
154 	IFCQ_LOCK_ASSERT_HELD(ifq);
155 	if_qflush(ifq->ifcq_ifp, ifq, true);
156 	VERIFY(IFCQ_IS_EMPTY(ifq));
157 	ifq->ifcq_flags &= ~IFCQF_ENABLED;
158 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
159 		/* Could be PKTSCHEDT_NONE */
160 		fq_if_teardown_ifclassq(ifq);
161 	}
162 	return;
163 }
164 
165 int
pktsched_getqstats(struct ifclassq * ifq,u_int32_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)166 pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
167     struct if_ifclassq_stats *ifqs)
168 {
169 	int error = 0;
170 
171 	IFCQ_LOCK_ASSERT_HELD(ifq);
172 
173 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
174 		/* Could be PKTSCHEDT_NONE */
175 		error = fq_if_getqstats_ifclassq(ifq, (uint8_t)gid, qid, ifqs);
176 	}
177 
178 	return error;
179 }
180 
181 void
pktsched_pkt_encap(pktsched_pkt_t * pkt,classq_pkt_t * cpkt)182 pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
183 {
184 	pkt->pktsched_pkt = *cpkt;
185 	pkt->pktsched_tail = *cpkt;
186 	pkt->pktsched_pcnt = 1;
187 
188 	switch (cpkt->cp_ptype) {
189 	case QP_MBUF:
190 		pkt->pktsched_plen =
191 		    (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
192 		break;
193 
194 #if SKYWALK
195 	case QP_PACKET:
196 		pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
197 		break;
198 #endif /* SKYWALK */
199 
200 	default:
201 		VERIFY(0);
202 		/* NOTREACHED */
203 		__builtin_unreachable();
204 	}
205 }
206 
207 void
pktsched_pkt_encap_chain(pktsched_pkt_t * pkt,classq_pkt_t * cpkt,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes)208 pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
209     classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
210 {
211 	pkt->pktsched_pkt = *cpkt;
212 	pkt->pktsched_tail = *tail;
213 	pkt->pktsched_pcnt = cnt;
214 	pkt->pktsched_plen = bytes;
215 
216 	switch (cpkt->cp_ptype) {
217 	case QP_MBUF:
218 		break;
219 
220 #if SKYWALK
221 	case QP_PACKET:
222 		break;
223 #endif /* SKYWALK */
224 
225 	default:
226 		VERIFY(0);
227 		/* NOTREACHED */
228 		__builtin_unreachable();
229 	}
230 }
231 
232 int
pktsched_clone_pkt(pktsched_pkt_t * pkt1,pktsched_pkt_t * pkt2)233 pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
234 {
235 	struct mbuf *m1, *m2;
236 #if SKYWALK
237 	struct __kern_packet *p1;
238 	kern_packet_t ph2;
239 	int err;
240 #endif /* SKYWALK */
241 
242 	ASSERT(pkt1 != NULL);
243 	ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
244 	ASSERT(pkt1->pktsched_pcnt == 1);
245 
246 	/* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
247 	ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
248 	    pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
249 	    pkt2->pktsched_pkt_mbuf == NULL));
250 
251 	switch (pkt1->pktsched_ptype) {
252 	case QP_MBUF:
253 		m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
254 		m2 = m_dup(m1, M_NOWAIT);
255 		if (__improbable(m2 == NULL)) {
256 			return ENOBUFS;
257 		}
258 		pkt2->pktsched_pkt_mbuf = m2;
259 		break;
260 
261 #if SKYWALK
262 	case QP_PACKET:
263 		p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
264 		err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
265 		    METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
266 		    KPKT_COPY_HEAVY);
267 		if (__improbable(err != 0)) {
268 			return err;
269 		}
270 		ASSERT(ph2 != 0);
271 		VERIFY(kern_packet_finalize(ph2) == 0);
272 		pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
273 		break;
274 #endif /* SKYWALK */
275 
276 	default:
277 		VERIFY(0);
278 		/* NOTREACHED */
279 		__builtin_unreachable();
280 	}
281 
282 	pkt2->pktsched_plen = pkt1->pktsched_plen;
283 	pkt2->pktsched_ptype = pkt1->pktsched_ptype;
284 	pkt2->pktsched_tail = pkt2->pktsched_pkt;
285 	pkt2->pktsched_pcnt = 1;
286 	return 0;
287 }
288 
289 void
pktsched_corrupt_packet(pktsched_pkt_t * pkt)290 pktsched_corrupt_packet(pktsched_pkt_t *pkt)
291 {
292 	struct mbuf *m = NULL;
293 	uint8_t *data = NULL;
294 	uint32_t data_len = 0;
295 	uint32_t rand32, rand_off, rand_bit;
296 #if SKYWALK
297 	struct __kern_packet *p = NULL;
298 #endif /* SKYWALK */
299 
300 	switch (pkt->pktsched_ptype) {
301 	case QP_MBUF:
302 		m = pkt->pktsched_pkt_mbuf;
303 		data = mtod(m, uint8_t *);
304 		data_len = m->m_pkthdr.len;
305 		break;
306 #if SKYWALK
307 	case QP_PACKET:
308 		p = pkt->pktsched_pkt_kpkt;
309 		if (p->pkt_pflags & PKT_F_MBUF_DATA) {
310 			m = p->pkt_mbuf;
311 			data = mtod(m, uint8_t *);
312 			data_len = m->m_pkthdr.len;
313 		} else {
314 			MD_BUFLET_ADDR_DLEN(p, data, data_len);
315 		}
316 		break;
317 #endif /* SKYWALK */
318 
319 	default:
320 		/* NOTREACHED */
321 		VERIFY(0);
322 		__builtin_unreachable();
323 	}
324 
325 	read_frandom(&rand32, sizeof(rand32));
326 	rand_bit = rand32 & 0x8;
327 	rand_off = (rand32 >> 3) % data_len;
328 	data[rand_off] ^= 1 << rand_bit;
329 }
330 
331 void
pktsched_free_pkt(pktsched_pkt_t * pkt)332 pktsched_free_pkt(pktsched_pkt_t *pkt)
333 {
334 	uint32_t cnt = pkt->pktsched_pcnt;
335 	ASSERT(cnt != 0);
336 
337 	switch (pkt->pktsched_ptype) {
338 	case QP_MBUF: {
339 		struct mbuf *m;
340 
341 		m = pkt->pktsched_pkt_mbuf;
342 		if (cnt == 1) {
343 			VERIFY(m->m_nextpkt == NULL);
344 		} else {
345 			VERIFY(m->m_nextpkt != NULL);
346 		}
347 		m_freem_list(m);
348 		break;
349 	}
350 #if SKYWALK
351 	case QP_PACKET: {
352 		struct __kern_packet *kpkt;
353 		int pcnt = 0;
354 
355 		kpkt = pkt->pktsched_pkt_kpkt;
356 		if (cnt == 1) {
357 			VERIFY(kpkt->pkt_nextpkt == NULL);
358 		} else {
359 			VERIFY(kpkt->pkt_nextpkt != NULL);
360 		}
361 		pp_free_packet_chain(kpkt, &pcnt);
362 		VERIFY(cnt == (uint32_t)pcnt);
363 		break;
364 	}
365 #endif /* SKYWALK */
366 
367 	default:
368 		VERIFY(0);
369 		/* NOTREACHED */
370 		__builtin_unreachable();
371 	}
372 	pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
373 	pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
374 	pkt->pktsched_plen = 0;
375 	pkt->pktsched_pcnt = 0;
376 }
377 
378 mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t * pkt)379 pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
380 {
381 	mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
382 
383 	switch (pkt->pktsched_ptype) {
384 	case QP_MBUF:
385 		svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
386 		break;
387 
388 #if SKYWALK
389 	case QP_PACKET:
390 		svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
391 		break;
392 #endif /* SKYWALK */
393 
394 	default:
395 		VERIFY(0);
396 		/* NOTREACHED */
397 		__builtin_unreachable();
398 	}
399 
400 	return svc;
401 }
402 
403 void
pktsched_get_pkt_vars(pktsched_pkt_t * pkt,volatile uint32_t ** flags,uint64_t ** timestamp,uint32_t * flowid,uint8_t * flowsrc,uint8_t * proto,uint32_t * comp_gencnt,uint64_t * pkt_tx_time)404 pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
405     uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
406     uint32_t *comp_gencnt, uint64_t *pkt_tx_time)
407 {
408 	switch (pkt->pktsched_ptype) {
409 	case QP_MBUF: {
410 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
411 
412 		if (flags != NULL) {
413 			*flags = &pkth->pkt_flags;
414 		}
415 		if (timestamp != NULL) {
416 			*timestamp = &pkth->pkt_timestamp;
417 		}
418 		if (flowid != NULL) {
419 			*flowid = pkth->pkt_flowid;
420 		}
421 		if (flowsrc != NULL) {
422 			*flowsrc = pkth->pkt_flowsrc;
423 		}
424 		if (proto != NULL) {
425 			/*
426 			 * rdar://100524205 - We want to use the pkt_ext_flags
427 			 * to denote QUIC packets, but AQM is already written in
428 			 * such a way where IPPROTO_QUIC is used to denote QUIC
429 			 * packets.
430 			 */
431 			if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) {
432 				*proto = IPPROTO_QUIC;
433 			} else {
434 				*proto = pkth->pkt_proto;
435 			}
436 		}
437 		if (comp_gencnt != NULL) {
438 			*comp_gencnt = pkth->comp_gencnt;
439 		}
440 		if (pkt_tx_time != NULL) {
441 			struct m_tag *tag;
442 			tag = m_tag_locate(pkt->pktsched_pkt_mbuf, KERNEL_MODULE_TAG_ID,
443 			    KERNEL_TAG_TYPE_AQM);
444 			if (__improbable(tag != NULL)) {
445 				*pkt_tx_time = *(uint64_t *)tag->m_tag_data;
446 			} else {
447 				*pkt_tx_time = 0;
448 			}
449 		}
450 
451 		break;
452 	}
453 
454 #if SKYWALK
455 	case QP_PACKET: {
456 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
457 
458 		if (flags != NULL) {
459 			/* use lower-32 bit for common flags */
460 			*flags = &kp->pkt_pflags32;
461 		}
462 		if (timestamp != NULL) {
463 			*timestamp = &kp->pkt_timestamp;
464 		}
465 		if (flowid != NULL) {
466 			*flowid = kp->pkt_flow_token;
467 		}
468 		if (flowsrc != NULL) {
469 			*flowsrc = (uint8_t)kp->pkt_flowsrc_type;
470 		}
471 		if (proto != NULL) {
472 			*proto = kp->pkt_transport_protocol;
473 		}
474 		if (comp_gencnt != NULL) {
475 			*comp_gencnt = kp->pkt_comp_gencnt;
476 		}
477 		if (pkt_tx_time != NULL && (kp->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0) {
478 			*pkt_tx_time = kp->pkt_com_opt->__po_pkt_tx_time;
479 		}
480 
481 		break;
482 	}
483 #endif /* SKYWALK */
484 
485 	default:
486 		VERIFY(0);
487 		/* NOTREACHED */
488 		__builtin_unreachable();
489 	}
490 }
491 
492 struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t * pkt,struct ifnet * ifp,int how)493 pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
494 {
495 #pragma unused(ifp)
496 	struct flowadv_fcentry *fce = NULL;
497 
498 	switch (pkt->pktsched_ptype) {
499 	case QP_MBUF: {
500 		struct mbuf *m = pkt->pktsched_pkt_mbuf;
501 
502 		fce = flowadv_alloc_entry(how);
503 		if (fce == NULL) {
504 			break;
505 		}
506 
507 		_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
508 		    sizeof(fce->fce_flowid));
509 
510 		fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
511 		fce->fce_flowid = m->m_pkthdr.pkt_flowid;
512 #if SKYWALK
513 		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
514 		    sizeof(fce->fce_flowsrc_token));
515 		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
516 		    sizeof(fce->fce_flowsrc_fidx));
517 
518 		if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
519 			fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
520 			fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
521 			fce->fce_ifp = ifp;
522 		}
523 #endif /* SKYWALK */
524 		break;
525 	}
526 
527 #if SKYWALK
528 	case QP_PACKET: {
529 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
530 
531 		fce = flowadv_alloc_entry(how);
532 		if (fce == NULL) {
533 			break;
534 		}
535 
536 		_CASSERT(sizeof(fce->fce_flowid) ==
537 		    sizeof(kp->pkt_flow_token));
538 		_CASSERT(sizeof(fce->fce_flowsrc_fidx) ==
539 		    sizeof(kp->pkt_flowsrc_fidx));
540 		_CASSERT(sizeof(fce->fce_flowsrc_token) ==
541 		    sizeof(kp->pkt_flowsrc_token));
542 
543 		ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
544 		fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
545 		fce->fce_flowid = kp->pkt_flow_token;
546 		fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
547 		fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
548 		fce->fce_ifp = ifp;
549 		break;
550 	}
551 #endif /* SKYWALK */
552 
553 	default:
554 		VERIFY(0);
555 		/* NOTREACHED */
556 		__builtin_unreachable();
557 	}
558 
559 	return fce;
560 }
561 
562 uint32_t *
pktsched_get_pkt_sfb_vars(pktsched_pkt_t * pkt,uint32_t ** sfb_flags)563 pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
564 {
565 	uint32_t *hashp = NULL;
566 
567 	switch (pkt->pktsched_ptype) {
568 	case QP_MBUF: {
569 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
570 
571 		_CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
572 		_CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
573 		*sfb_flags = &pkth->pkt_mpriv_flags;
574 		hashp = &pkth->pkt_mpriv_hash;
575 		break;
576 	}
577 
578 #if SKYWALK
579 	case QP_PACKET: {
580 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
581 
582 		_CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t));
583 		_CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t));
584 		*sfb_flags = &kp->pkt_classq_flags;
585 		hashp = &kp->pkt_classq_hash;
586 		break;
587 	}
588 #endif /* SKYWALK */
589 
590 	default:
591 		VERIFY(0);
592 		/* NOTREACHED */
593 		__builtin_unreachable();
594 	}
595 
596 	return hashp;
597 }
598 
599 static int
pktsched_mbuf_mark_ecn(struct mbuf * m)600 pktsched_mbuf_mark_ecn(struct mbuf* m)
601 {
602 	struct mbuf     *m0;
603 	void            *hdr;
604 	int             af;
605 	uint8_t         ipv;
606 
607 	hdr = m->m_pkthdr.pkt_hdr;
608 	/* verify that hdr is within the mbuf data */
609 	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
610 		if (((caddr_t)hdr >= m_mtod_current(m0)) &&
611 		    ((caddr_t)hdr < m_mtod_current(m0) + m0->m_len)) {
612 			break;
613 		}
614 	}
615 	if (m0 == NULL) {
616 		return EINVAL;
617 	}
618 	ipv = IP_VHL_V(*(uint8_t *)hdr);
619 	if (ipv == 4) {
620 		af = AF_INET;
621 	} else if (ipv == 6) {
622 		af = AF_INET6;
623 	} else {
624 		af = AF_UNSPEC;
625 	}
626 
627 	switch (af) {
628 	case AF_INET: {
629 		struct ip *ip = hdr;
630 		uint8_t otos;
631 		int sum;
632 
633 		if (((uintptr_t)ip + sizeof(*ip)) >
634 		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
635 			return EINVAL;    /* out of bounds */
636 		}
637 		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
638 			return EINVAL;    /* not-ECT */
639 		}
640 		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
641 			return 0;    /* already marked */
642 		}
643 		/*
644 		 * ecn-capable but not marked,
645 		 * mark CE and update checksum
646 		 */
647 		otos = ip->ip_tos;
648 		ip->ip_tos |= IPTOS_ECN_CE;
649 		/*
650 		 * update checksum (from RFC1624) only if hw
651 		 * checksum is not supported.
652 		 *         HC' = ~(~HC + ~m + m')
653 		 */
654 		if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
655 			sum = ~ntohs(ip->ip_sum) & 0xffff;
656 			sum += (~otos & 0xffff) + ip->ip_tos;
657 			sum = (sum >> 16) + (sum & 0xffff);
658 			sum += (sum >> 16); /* add carry */
659 			ip->ip_sum = htons(~sum & 0xffff);
660 		}
661 		return 0;
662 	}
663 	case AF_INET6: {
664 		struct ip6_hdr *ip6 = hdr;
665 		u_int32_t flowlabel;
666 
667 		if (((uintptr_t)ip6 + sizeof(*ip6)) >
668 		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
669 			return EINVAL;    /* out of bounds */
670 		}
671 		flowlabel = ntohl(ip6->ip6_flow);
672 		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
673 		    (IPTOS_ECN_NOTECT << 20)) {
674 			return EINVAL;    /* not-ECT */
675 		}
676 		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
677 		    (IPTOS_ECN_CE << 20)) {
678 			return 0;    /* already marked */
679 		}
680 		/*
681 		 * ecn-capable but not marked,  mark CE
682 		 */
683 		flowlabel |= (IPTOS_ECN_CE << 20);
684 		ip6->ip6_flow = htonl(flowlabel);
685 		return 0;
686 	}
687 	default:
688 		return EPROTONOSUPPORT;
689 	}
690 }
691 
692 static int
pktsched_kpkt_mark_ecn(struct __kern_packet * kpkt)693 pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
694 {
695 	uint8_t ipv = 0, *l3_hdr;
696 
697 	if ((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0) {
698 		ipv = kpkt->pkt_flow_ip_ver;
699 		l3_hdr = (uint8_t *)kpkt->pkt_flow_ip_hdr;
700 	} else {
701 		uint8_t *pkt_buf;
702 		uint32_t bdlen, bdlim, bdoff;
703 		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);
704 
705 		/* takes care of both IPv4 and IPv6 */
706 		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
707 		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
708 		if (ipv == 4) {
709 			ipv = IPVERSION;
710 		} else if (ipv == 6) {
711 			ipv = IPV6_VERSION;
712 		} else {
713 			ipv = 0;
714 		}
715 	}
716 
717 	switch (ipv) {
718 	case IPVERSION: {
719 		uint8_t otos;
720 		int sum;
721 
722 		struct ip *ip = (struct ip *)(void *)l3_hdr;
723 		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
724 			return EINVAL;    /* not-ECT */
725 		}
726 		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
727 			return 0;    /* already marked */
728 		}
729 		/*
730 		 * ecn-capable but not marked,
731 		 * mark CE and update checksum
732 		 */
733 		otos = ip->ip_tos;
734 		ip->ip_tos |= IPTOS_ECN_CE;
735 
736 		sum = ~ntohs(ip->ip_sum) & 0xffff;
737 		sum += (~otos & 0xffff) + ip->ip_tos;
738 		sum = (sum >> 16) + (sum & 0xffff);
739 		sum += (sum >> 16); /* add carry */
740 		ip->ip_sum = htons(~sum & 0xffff);
741 
742 		return 0;
743 	}
744 	case IPV6_VERSION: {
745 		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;
746 		u_int32_t flowlabel;
747 		flowlabel = ntohl(ip6->ip6_flow);
748 		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
749 		    (IPTOS_ECN_NOTECT << 20)) {
750 			return EINVAL;    /* not-ECT */
751 		}
752 		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
753 		    (IPTOS_ECN_CE << 20)) {
754 			return 0;    /* already marked */
755 		}
756 		/*
757 		 * ecn-capable but not marked, mark CE
758 		 */
759 		flowlabel |= (IPTOS_ECN_CE << 20);
760 		ip6->ip6_flow = htonl(flowlabel);
761 
762 		return 0;
763 	}
764 	default:
765 		return EPROTONOSUPPORT;
766 	}
767 }
768 
769 int
pktsched_mark_ecn(pktsched_pkt_t * pkt)770 pktsched_mark_ecn(pktsched_pkt_t *pkt)
771 {
772 	switch (pkt->pktsched_ptype) {
773 	case QP_MBUF:
774 		return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
775 	case QP_PACKET:
776 		return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
777 	default:
778 		VERIFY(0);
779 		/* NOTREACHED */
780 		__builtin_unreachable();
781 	}
782 }
783 
784 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)785 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
786 {
787 	switch (pkt->pktsched_ptype) {
788 	case QP_MBUF: {
789 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
790 		return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
791 	}
792 	case QP_PACKET: {
793 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
794 		return (kp->pkt_pflags & PKT_F_L4S) != 0;
795 	}
796 
797 	default:
798 		VERIFY(0);
799 		/* NOTREACHED */
800 		__builtin_unreachable();
801 	}
802 	return FALSE;
803 }
804 
805 struct aqm_tag_container {
806 	struct m_tag            aqm_m_tag;
807 	uint64_t                aqm_tag;
808 };
809 
810 static struct  m_tag *
m_tag_kalloc_aqm(u_int32_t id,u_int16_t type,uint16_t len,int wait)811 m_tag_kalloc_aqm(u_int32_t id, u_int16_t type, uint16_t len, int wait)
812 {
813 	struct aqm_tag_container *tag_container;
814 	struct m_tag *tag = NULL;
815 
816 	assert3u(id, ==, KERNEL_MODULE_TAG_ID);
817 	assert3u(type, ==, KERNEL_TAG_TYPE_AQM);
818 	assert3u(len, ==, sizeof(uint64_t));
819 
820 	if (len != sizeof(uint64_t)) {
821 		return NULL;
822 	}
823 
824 	tag_container = kalloc_type(struct aqm_tag_container, wait | M_ZERO);
825 	if (tag_container != NULL) {
826 		tag = &tag_container->aqm_m_tag;
827 
828 		assert3p(tag, ==, tag_container);
829 
830 		M_TAG_INIT(tag, id, type, len, &tag_container->aqm_tag, NULL);
831 	}
832 
833 	return tag;
834 }
835 
836 static void
m_tag_kfree_aqm(struct m_tag * tag)837 m_tag_kfree_aqm(struct m_tag *tag)
838 {
839 	struct aqm_tag_container *tag_container = (struct aqm_tag_container *)tag;
840 
841 	assert3u(tag->m_tag_len, ==, sizeof(uint64_t));
842 
843 	kfree_type(struct aqm_tag_container, tag_container);
844 }
845 
846 void
pktsched_register_m_tag(void)847 pktsched_register_m_tag(void)
848 {
849 	int error;
850 
851 	error = m_register_internal_tag_type(KERNEL_TAG_TYPE_AQM, sizeof(uint64_t),
852 	    m_tag_kalloc_aqm, m_tag_kfree_aqm);
853 
854 	assert3u(error, ==, 0);
855 }
856