xref: /xnu-8792.81.2/bsd/net/pktsched/pktsched.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/cdefs.h>
30 
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39 
40 #include <dev/random/randomdev.h>
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_dl.h>
44 #include <net/if_types.h>
45 #include <net/net_osdep.h>
46 #include <net/pktsched/pktsched.h>
47 #include <net/pktsched/pktsched_fq_codel.h>
48 #include <net/pktsched/pktsched_netem.h>
49 
50 #define _IP_VHL
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 
54 #include <pexpert/pexpert.h>
55 
56 #if SKYWALK
57 #include <skywalk/os_skywalk_private.h>
58 #endif /* SKYWALK */
59 
/* Machine clock (timebase) frequency in Hz; set once by init_machclk(). */
u_int32_t machclk_freq = 0;
/* Number of mach absolute-time units in one second; set by init_machclk(). */
u_int64_t machclk_per_sec = 0;
u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */

static void init_machclk(void);

/* sysctl net.pktsched: root node for packet-scheduler tunables. */
SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");

/* sysctl net.pktsched.verbose: runtime-adjustable debug verbosity. */
SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
    &pktsched_verbose, 0, "Packet scheduler verbosity level");
70 
71 void
pktsched_init(void)72 pktsched_init(void)
73 {
74 	init_machclk();
75 	if (machclk_freq == 0) {
76 		panic("%s: no CPU clock available!", __func__);
77 		/* NOTREACHED */
78 	}
79 	pktsched_fq_init();
80 }
81 
82 static void
init_machclk(void)83 init_machclk(void)
84 {
85 	/*
86 	 * Initialize machclk_freq using the timerbase frequency
87 	 * value from device specific info.
88 	 */
89 	machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;
90 
91 	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
92 	    &machclk_per_sec);
93 }
94 
95 u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)96 pktsched_abs_to_nsecs(u_int64_t abstime)
97 {
98 	u_int64_t nsecs;
99 
100 	absolutetime_to_nanoseconds(abstime, &nsecs);
101 	return nsecs;
102 }
103 
104 u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)105 pktsched_nsecs_to_abstime(u_int64_t nsecs)
106 {
107 	u_int64_t abstime;
108 
109 	nanoseconds_to_absolutetime(nsecs, &abstime);
110 	return abstime;
111 }
112 
/*
 * Attach (or switch) the packet scheduler on an interface classq.
 * FQ-CoDel is the only scheduler that can actually be instantiated here
 * (fq_if_setup_ifclassq() is called unconditionally); "scheduler" is
 * used only to detect a no-op type change.
 *
 * Returns 0 on success or the error from fq_if_setup_ifclassq().
 * Must be called with the ifclassq lock held.
 */
int
pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags,
    classq_pkt_type_t ptype)
{
	int error = 0;
	u_int32_t rflags;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	/* init_machclk() must have run before any scheduler is set up. */
	VERIFY(machclk_freq != 0);

	/* Nothing to do unless the scheduler type changes */
	if (ifq->ifcq_type == scheduler) {
		return 0;
	}

	/*
	 * Remember the flags that need to be restored upon success, as
	 * they may be cleared when we tear down existing scheduler.
	 */
	rflags = (ifq->ifcq_flags & IFCQF_ENABLED);

	if (ifq->ifcq_type != PKTSCHEDT_NONE) {
		pktsched_teardown(ifq);

		/* Teardown should have succeeded */
		VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
		VERIFY(ifq->ifcq_disc == NULL);
	}

	error = fq_if_setup_ifclassq(ifq, sflags, ptype);
	if (error == 0) {
		/* restore the enabled flag cleared by teardown above */
		ifq->ifcq_flags |= rflags;
	}

	return error;
}
150 
151 void
pktsched_teardown(struct ifclassq * ifq)152 pktsched_teardown(struct ifclassq *ifq)
153 {
154 	IFCQ_LOCK_ASSERT_HELD(ifq);
155 	if_qflush(ifq->ifcq_ifp, ifq, true);
156 	VERIFY(IFCQ_IS_EMPTY(ifq));
157 	ifq->ifcq_flags &= ~IFCQF_ENABLED;
158 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
159 		/* Could be PKTSCHEDT_NONE */
160 		fq_if_teardown_ifclassq(ifq);
161 	}
162 	return;
163 }
164 
165 int
pktsched_getqstats(struct ifclassq * ifq,u_int32_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)166 pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
167     struct if_ifclassq_stats *ifqs)
168 {
169 	int error = 0;
170 
171 	IFCQ_LOCK_ASSERT_HELD(ifq);
172 
173 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
174 		/* Could be PKTSCHEDT_NONE */
175 		error = fq_if_getqstats_ifclassq(ifq, (uint8_t)gid, qid, ifqs);
176 	}
177 
178 	return error;
179 }
180 
181 void
pktsched_pkt_encap(pktsched_pkt_t * pkt,classq_pkt_t * cpkt)182 pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
183 {
184 	pkt->pktsched_pkt = *cpkt;
185 	pkt->pktsched_tail = *cpkt;
186 	pkt->pktsched_pcnt = 1;
187 
188 	switch (cpkt->cp_ptype) {
189 	case QP_MBUF:
190 		pkt->pktsched_plen =
191 		    (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
192 		break;
193 
194 #if SKYWALK
195 	case QP_PACKET:
196 		pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
197 		break;
198 #endif /* SKYWALK */
199 
200 	default:
201 		VERIFY(0);
202 		/* NOTREACHED */
203 		__builtin_unreachable();
204 	}
205 }
206 
207 void
pktsched_pkt_encap_chain(pktsched_pkt_t * pkt,classq_pkt_t * cpkt,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes)208 pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
209     classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
210 {
211 	pkt->pktsched_pkt = *cpkt;
212 	pkt->pktsched_tail = *tail;
213 	pkt->pktsched_pcnt = cnt;
214 	pkt->pktsched_plen = bytes;
215 
216 	switch (cpkt->cp_ptype) {
217 	case QP_MBUF:
218 		break;
219 
220 #if SKYWALK
221 	case QP_PACKET:
222 		break;
223 #endif /* SKYWALK */
224 
225 	default:
226 		VERIFY(0);
227 		/* NOTREACHED */
228 		__builtin_unreachable();
229 	}
230 }
231 
/*
 * Deep-copy the single packet held in pkt1 into pkt2.  In-place cloning
 * (pkt1 == pkt2) is allowed; otherwise pkt2 must not already hold a
 * packet, or it would leak.  The clone becomes a single-packet chain
 * (head == tail, count of one) with pkt1's length and type.
 *
 * Returns 0 on success, ENOBUFS if the mbuf duplication fails, or the
 * error from kern_packet_clone_nosleep() for Skywalk packets.
 */
int
pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
{
	struct mbuf *m1, *m2;
#if SKYWALK
	struct __kern_packet *p1;
	kern_packet_t ph2;
	int err;
#endif /* SKYWALK */

	ASSERT(pkt1 != NULL);
	/* NOTE(review): checks the union slot regardless of ptype —
	 * presumably relies on mbuf/kpkt pointers sharing storage. */
	ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
	ASSERT(pkt1->pktsched_pcnt == 1);

	/* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
	ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
	    pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
	    pkt2->pktsched_pkt_mbuf == NULL));

	switch (pkt1->pktsched_ptype) {
	case QP_MBUF:
		m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
		/* duplicate the mbuf chain without blocking */
		m2 = m_dup(m1, M_NOWAIT);
		if (__improbable(m2 == NULL)) {
			return ENOBUFS;
		}
		pkt2->pktsched_pkt_mbuf = m2;
		break;

#if SKYWALK
	case QP_PACKET:
		p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
		/* heavy copy: clone both metadata and buffer contents */
		err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
		    METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
		    KPKT_COPY_HEAVY);
		if (__improbable(err != 0)) {
			return err;
		}
		ASSERT(ph2 != 0);
		VERIFY(kern_packet_finalize(ph2) == 0);
		pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* clone is a single-packet chain: head == tail, count of one */
	pkt2->pktsched_plen = pkt1->pktsched_plen;
	pkt2->pktsched_ptype = pkt1->pktsched_ptype;
	pkt2->pktsched_tail = pkt2->pktsched_pkt;
	pkt2->pktsched_pcnt = 1;
	return 0;
}
288 
289 void
pktsched_corrupt_packet(pktsched_pkt_t * pkt)290 pktsched_corrupt_packet(pktsched_pkt_t *pkt)
291 {
292 	struct mbuf *m = NULL;
293 	uint8_t *data = NULL;
294 	uint32_t data_len = 0;
295 	uint32_t rand32, rand_off, rand_bit;
296 #if SKYWALK
297 	struct __kern_packet *p = NULL;
298 #endif /* SKYWALK */
299 
300 	switch (pkt->pktsched_ptype) {
301 	case QP_MBUF:
302 		m = pkt->pktsched_pkt_mbuf;
303 		data = mtod(m, uint8_t *);
304 		data_len = m->m_pkthdr.len;
305 		break;
306 #if SKYWALK
307 	case QP_PACKET:
308 		p = pkt->pktsched_pkt_kpkt;
309 		if (p->pkt_pflags & PKT_F_MBUF_DATA) {
310 			m = p->pkt_mbuf;
311 			data = mtod(m, uint8_t *);
312 			data_len = m->m_pkthdr.len;
313 		} else {
314 			MD_BUFLET_ADDR_DLEN(p, data, data_len);
315 		}
316 		break;
317 #endif /* SKYWALK */
318 
319 	default:
320 		/* NOTREACHED */
321 		VERIFY(0);
322 		__builtin_unreachable();
323 	}
324 
325 	read_frandom(&rand32, sizeof(rand32));
326 	rand_bit = rand32 & 0x8;
327 	rand_off = (rand32 >> 3) % data_len;
328 	data[rand_off] ^= 1 << rand_bit;
329 }
330 
/*
 * Release every packet held by pkt (a chain of pktsched_pcnt packets)
 * and reset pkt to an empty state.  The chain-link invariants are
 * verified: a count of one implies no successor, more than one implies
 * a linked chain.
 */
void
pktsched_free_pkt(pktsched_pkt_t *pkt)
{
	uint32_t cnt = pkt->pktsched_pcnt;
	ASSERT(cnt != 0);

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m;

		m = pkt->pktsched_pkt_mbuf;
		if (cnt == 1) {
			VERIFY(m->m_nextpkt == NULL);
		} else {
			VERIFY(m->m_nextpkt != NULL);
		}
		/* frees the whole m_nextpkt-linked chain */
		m_freem_list(m);
		break;
	}
#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kpkt;
		int pcnt = 0;

		kpkt = pkt->pktsched_pkt_kpkt;
		if (cnt == 1) {
			VERIFY(kpkt->pkt_nextpkt == NULL);
		} else {
			VERIFY(kpkt->pkt_nextpkt != NULL);
		}
		pp_free_packet_chain(kpkt, &pcnt);
		/* the pool must have freed exactly as many as we held */
		VERIFY(cnt == (uint32_t)pcnt);
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* reset to an empty wrapper so a stale pointer can't be reused */
	pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
	pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
	pkt->pktsched_plen = 0;
	pkt->pktsched_pcnt = 0;
}
377 
378 mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t * pkt)379 pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
380 {
381 	mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
382 
383 	switch (pkt->pktsched_ptype) {
384 	case QP_MBUF:
385 		svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
386 		break;
387 
388 #if SKYWALK
389 	case QP_PACKET:
390 		svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
391 		break;
392 #endif /* SKYWALK */
393 
394 	default:
395 		VERIFY(0);
396 		/* NOTREACHED */
397 		__builtin_unreachable();
398 	}
399 
400 	return svc;
401 }
402 
/*
 * Extract commonly-needed fields from the wrapped packet's header.
 * Any out-parameter may be NULL to skip that field.  Note that "flags"
 * and "timestamp" are returned BY REFERENCE into the live packet
 * header, so the caller may read or modify them in place; the scalar
 * fields (flowid, flowsrc, proto, comp_gencnt) are copied out by value.
 */
void
pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
    uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
    uint32_t *comp_gencnt)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		if (flags != NULL) {
			*flags = &pkth->pkt_flags;
		}
		if (timestamp != NULL) {
			*timestamp = &pkth->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = pkth->pkt_flowid;
		}
		if (flowsrc != NULL) {
			*flowsrc = pkth->pkt_flowsrc;
		}
		if (proto != NULL) {
			*proto = pkth->pkt_proto;
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = pkth->comp_gencnt;
		}

		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		if (flags != NULL) {
			/* use lower-32 bit for common flags */
			*flags = &kp->pkt_pflags32;
		}
		if (timestamp != NULL) {
			*timestamp = &kp->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = kp->pkt_flow_token;
		}
		if (flowsrc != NULL) {
			*flowsrc = (uint8_t)kp->pkt_flowsrc_type;
		}
		if (proto != NULL) {
			*proto = kp->pkt_transport_protocol;
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = kp->pkt_comp_gencnt;
		}

		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
468 
/*
 * Allocate and populate a flow-advisory entry from the wrapped packet's
 * flow identifiers, so the flow can later be resumed when the queue
 * drains.  "how" is passed through to flowadv_alloc_entry() as the
 * allocation-blocking mode.  Returns NULL if allocation fails.
 */
struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
{
#pragma unused(ifp)
	struct flowadv_fcentry *fce = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m = pkt->pktsched_pkt_mbuf;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		/* compile-time proof the copies below can't truncate */
		_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
		    sizeof(fce->fce_flowid));

		fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		fce->fce_flowid = m->m_pkthdr.pkt_flowid;
#if SKYWALK
		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
		    sizeof(fce->fce_flowsrc_token));
		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
		    sizeof(fce->fce_flowsrc_fidx));

		/* channel-originated mbufs carry extra flow identity */
		if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
			fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
			fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
			fce->fce_ifp = ifp;
		}
#endif /* SKYWALK */
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		_CASSERT(sizeof(fce->fce_flowid) ==
		    sizeof(kp->pkt_flow_token));
		_CASSERT(sizeof(fce->fce_flowsrc_fidx) ==
		    sizeof(kp->pkt_flowsrc_fidx));
		_CASSERT(sizeof(fce->fce_flowsrc_token) ==
		    sizeof(kp->pkt_flowsrc_token));

		/* caller must only request this for flow-advisory packets */
		ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
		fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
		fce->fce_flowid = kp->pkt_flow_token;
		fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
		fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
		fce->fce_ifp = ifp;
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return fce;
}
538 
/*
 * Return pointers into the packet header used by the SFB AQM: the hash
 * word (returned value) and the flags word (via *sfb_flags).  Both
 * point at live header fields so SFB can update them in place.
 */
uint32_t *
pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
{
	uint32_t *hashp = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		/* the mpriv fields double as SFB hash/flags storage */
		_CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
		_CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
		*sfb_flags = &pkth->pkt_mpriv_flags;
		hashp = &pkth->pkt_mpriv_hash;
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		_CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t));
		_CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t));
		*sfb_flags = &kp->pkt_classq_flags;
		hashp = &kp->pkt_classq_hash;
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return hashp;
}
575 
/*
 * Mark the IP/IPv6 header of an mbuf packet with ECN Congestion
 * Experienced (CE), per RFC 3168.  Only ECN-capable packets (ECT set)
 * are marked; the IPv4 checksum is incrementally updated per RFC 1624
 * when not offloaded to hardware.
 *
 * Returns 0 if the packet was marked (or already CE), EINVAL if the
 * header is not ECN-capable or lies outside the mbuf data, and
 * EPROTONOSUPPORT for non-IP packets.
 */
static int
pktsched_mbuf_mark_ecn(struct mbuf* m)
{
	struct mbuf     *m0;
	void            *hdr;
	int             af;
	uint8_t         ipv;

	hdr = m->m_pkthdr.pkt_hdr;
	/* verify that hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (((caddr_t)hdr >= m0->m_data) &&
		    ((caddr_t)hdr < m0->m_data + m0->m_len)) {
			break;
		}
	}
	if (m0 == NULL) {
		return EINVAL;
	}
	/* sniff the IP version from the first header byte */
	ipv = IP_VHL_V(*(uint8_t *)hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}

	switch (af) {
	case AF_INET: {
		struct ip *ip = hdr;
		uint8_t otos;
		int sum;

		/* the full IPv4 header must fit within this mbuf's buffer */
		if (((uintptr_t)ip + sizeof(*ip)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;
		/*
		 * update checksum (from RFC1624) only if hw
		 * checksum is not supported.
		 *         HC' = ~(~HC + ~m + m')
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16); /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
		}
		return 0;
	}
	case AF_INET6: {
		struct ip6_hdr *ip6 = hdr;
		u_int32_t flowlabel;

		/* the full IPv6 header must fit within this mbuf's buffer */
		if (((uintptr_t)ip6 + sizeof(*ip6)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		/* ECN bits live at bits 20-21 of the ip6_flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,  mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);
		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
668 
/*
 * Skywalk variant of ECN CE marking: locate the L3 header either from
 * the flow-classifier metadata or by walking past the L2 header in the
 * buflet, then mark CE as in pktsched_mbuf_mark_ecn().  The IPv4
 * checksum is always updated incrementally (RFC 1624) here.
 *
 * Returns 0 if marked (or already CE), EINVAL if not ECN-capable,
 * EPROTONOSUPPORT for non-IP packets.
 */
static int
pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
{
	uint8_t ipv = 0, *l3_hdr;

	/*
	 * NOTE(review): the classified path is annotated __improbable,
	 * implying most packets here lack QUM_F_FLOW_CLASSIFIED —
	 * confirm against callers.
	 */
	if (__improbable((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		ipv = kpkt->pkt_flow_ip_ver;
		l3_hdr = (uint8_t *)kpkt->pkt_flow_ip_hdr;
	} else {
		uint8_t *pkt_buf;
		uint16_t bdlen, bdlim, bdoff;
		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);

		/* takes care of both IPv4 and IPv6 */
		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
		/* normalize the sniffed version nibble to IPVERSION/IPV6_VERSION */
		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
		if (ipv == 4) {
			ipv = IPVERSION;
		} else if (ipv == 6) {
			ipv = IPV6_VERSION;
		} else {
			ipv = 0;
		}
	}

	switch (ipv) {
	case IPVERSION: {
		uint8_t otos;
		int sum;

		struct ip *ip = (struct ip *)(void *)l3_hdr;
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;

		/* incremental checksum update (RFC 1624): HC' = ~(~HC + ~m + m') */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += (~otos & 0xffff) + ip->ip_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16); /* add carry */
		ip->ip_sum = htons(~sum & 0xffff);

		return 0;
	}
	case IPV6_VERSION: {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;
		u_int32_t flowlabel;
		/* ECN bits live at bits 20-21 of the ip6_flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);

		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
745 
746 int
pktsched_mark_ecn(pktsched_pkt_t * pkt)747 pktsched_mark_ecn(pktsched_pkt_t *pkt)
748 {
749 	switch (pkt->pktsched_ptype) {
750 	case QP_MBUF:
751 		return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
752 	case QP_PACKET:
753 		return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
754 	default:
755 		VERIFY(0);
756 		/* NOTREACHED */
757 		__builtin_unreachable();
758 	}
759 }
760 
761 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)762 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
763 {
764 	switch (pkt->pktsched_ptype) {
765 	case QP_MBUF: {
766 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
767 		return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
768 	}
769 	case QP_PACKET: {
770 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
771 		return (kp->pkt_pflags & PKT_F_L4S) != 0;
772 	}
773 
774 	default:
775 		VERIFY(0);
776 		/* NOTREACHED */
777 		__builtin_unreachable();
778 	}
779 	return FALSE;
780 }
781