xref: /xnu-8796.121.2/bsd/net/pktsched/pktsched.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/cdefs.h>
30 
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/errno.h>
37 #include <sys/mcache.h>
38 #include <sys/sysctl.h>
39 
40 #include <dev/random/randomdev.h>
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_dl.h>
44 #include <net/if_types.h>
45 #include <net/net_osdep.h>
46 #include <net/pktsched/pktsched.h>
47 #include <net/pktsched/pktsched_fq_codel.h>
48 #include <net/pktsched/pktsched_netem.h>
49 
50 #define _IP_VHL
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 
54 #include <pexpert/pexpert.h>
55 
56 #if SKYWALK
57 #include <skywalk/os_skywalk_private.h>
58 #endif /* SKYWALK */
59 
/* Machine clock (timebase) frequency in ticks per second; set once at init. */
u_int32_t machclk_freq = 0;
/* Number of absolute-time units in one second, cached by init_machclk(). */
u_int64_t machclk_per_sec = 0;
u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */

static void init_machclk(void);

/* sysctl net.pktsched: parent node for packet-scheduler tunables */
SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");

/* sysctl net.pktsched.verbose: runtime-adjustable debug verbosity */
SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
    &pktsched_verbose, 0, "Packet scheduler verbosity level");
70 
71 void
pktsched_init(void)72 pktsched_init(void)
73 {
74 	init_machclk();
75 	if (machclk_freq == 0) {
76 		panic("%s: no CPU clock available!", __func__);
77 		/* NOTREACHED */
78 	}
79 	pktsched_fq_init();
80 }
81 
82 static void
init_machclk(void)83 init_machclk(void)
84 {
85 	/*
86 	 * Initialize machclk_freq using the timerbase frequency
87 	 * value from device specific info.
88 	 */
89 	machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;
90 
91 	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC,
92 	    &machclk_per_sec);
93 }
94 
95 u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)96 pktsched_abs_to_nsecs(u_int64_t abstime)
97 {
98 	u_int64_t nsecs;
99 
100 	absolutetime_to_nanoseconds(abstime, &nsecs);
101 	return nsecs;
102 }
103 
104 u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)105 pktsched_nsecs_to_abstime(u_int64_t nsecs)
106 {
107 	u_int64_t abstime;
108 
109 	nanoseconds_to_absolutetime(nsecs, &abstime);
110 	return abstime;
111 }
112 
113 int
pktsched_setup(struct ifclassq * ifq,u_int32_t scheduler,u_int32_t sflags,classq_pkt_type_t ptype)114 pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags,
115     classq_pkt_type_t ptype)
116 {
117 	int error = 0;
118 	u_int32_t rflags;
119 
120 	IFCQ_LOCK_ASSERT_HELD(ifq);
121 
122 	VERIFY(machclk_freq != 0);
123 
124 	/* Nothing to do unless the scheduler type changes */
125 	if (ifq->ifcq_type == scheduler) {
126 		return 0;
127 	}
128 
129 	/*
130 	 * Remember the flags that need to be restored upon success, as
131 	 * they may be cleared when we tear down existing scheduler.
132 	 */
133 	rflags = (ifq->ifcq_flags & IFCQF_ENABLED);
134 
135 	if (ifq->ifcq_type != PKTSCHEDT_NONE) {
136 		pktsched_teardown(ifq);
137 
138 		/* Teardown should have succeeded */
139 		VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
140 		VERIFY(ifq->ifcq_disc == NULL);
141 	}
142 
143 	error = fq_if_setup_ifclassq(ifq, sflags, ptype);
144 	if (error == 0) {
145 		ifq->ifcq_flags |= rflags;
146 	}
147 
148 	return error;
149 }
150 
151 void
pktsched_teardown(struct ifclassq * ifq)152 pktsched_teardown(struct ifclassq *ifq)
153 {
154 	IFCQ_LOCK_ASSERT_HELD(ifq);
155 	if_qflush(ifq->ifcq_ifp, ifq, true);
156 	VERIFY(IFCQ_IS_EMPTY(ifq));
157 	ifq->ifcq_flags &= ~IFCQF_ENABLED;
158 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
159 		/* Could be PKTSCHEDT_NONE */
160 		fq_if_teardown_ifclassq(ifq);
161 	}
162 	return;
163 }
164 
165 int
pktsched_getqstats(struct ifclassq * ifq,u_int32_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)166 pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
167     struct if_ifclassq_stats *ifqs)
168 {
169 	int error = 0;
170 
171 	IFCQ_LOCK_ASSERT_HELD(ifq);
172 
173 	if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
174 		/* Could be PKTSCHEDT_NONE */
175 		error = fq_if_getqstats_ifclassq(ifq, (uint8_t)gid, qid, ifqs);
176 	}
177 
178 	return error;
179 }
180 
181 void
pktsched_pkt_encap(pktsched_pkt_t * pkt,classq_pkt_t * cpkt)182 pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
183 {
184 	pkt->pktsched_pkt = *cpkt;
185 	pkt->pktsched_tail = *cpkt;
186 	pkt->pktsched_pcnt = 1;
187 
188 	switch (cpkt->cp_ptype) {
189 	case QP_MBUF:
190 		pkt->pktsched_plen =
191 		    (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
192 		break;
193 
194 #if SKYWALK
195 	case QP_PACKET:
196 		pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
197 		break;
198 #endif /* SKYWALK */
199 
200 	default:
201 		VERIFY(0);
202 		/* NOTREACHED */
203 		__builtin_unreachable();
204 	}
205 }
206 
207 void
pktsched_pkt_encap_chain(pktsched_pkt_t * pkt,classq_pkt_t * cpkt,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes)208 pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
209     classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
210 {
211 	pkt->pktsched_pkt = *cpkt;
212 	pkt->pktsched_tail = *tail;
213 	pkt->pktsched_pcnt = cnt;
214 	pkt->pktsched_plen = bytes;
215 
216 	switch (cpkt->cp_ptype) {
217 	case QP_MBUF:
218 		break;
219 
220 #if SKYWALK
221 	case QP_PACKET:
222 		break;
223 #endif /* SKYWALK */
224 
225 	default:
226 		VERIFY(0);
227 		/* NOTREACHED */
228 		__builtin_unreachable();
229 	}
230 }
231 
/*
 * Duplicate the single packet held in pkt1 into pkt2.  In-place cloning
 * (pkt1 == pkt2) is permitted; otherwise pkt2 must not already hold a
 * packet, or that packet would leak.  Returns 0 on success, ENOBUFS (or
 * the underlying clone error) on allocation failure.
 */
int
pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
{
	struct mbuf *m1, *m2;
#if SKYWALK
	struct __kern_packet *p1;
	kern_packet_t ph2;
	int err;
#endif /* SKYWALK */

	ASSERT(pkt1 != NULL);
	ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
	/* only single-packet (non-chained) descriptors may be cloned */
	ASSERT(pkt1->pktsched_pcnt == 1);

	/* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
	ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
	    pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
	    pkt2->pktsched_pkt_mbuf == NULL));

	switch (pkt1->pktsched_ptype) {
	case QP_MBUF:
		m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
		/* deep copy so the clone owns its own data */
		m2 = m_dup(m1, M_NOWAIT);
		if (__improbable(m2 == NULL)) {
			return ENOBUFS;
		}
		pkt2->pktsched_pkt_mbuf = m2;
		break;

#if SKYWALK
	case QP_PACKET:
		p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
		/* KPKT_COPY_HEAVY duplicates metadata and buffer contents */
		err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
		    METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
		    KPKT_COPY_HEAVY);
		if (__improbable(err != 0)) {
			return err;
		}
		ASSERT(ph2 != 0);
		VERIFY(kern_packet_finalize(ph2) == 0);
		pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mirror the metadata; the clone is a one-packet chain */
	pkt2->pktsched_plen = pkt1->pktsched_plen;
	pkt2->pktsched_ptype = pkt1->pktsched_ptype;
	pkt2->pktsched_tail = pkt2->pktsched_pkt;
	pkt2->pktsched_pcnt = 1;
	return 0;
}
288 
289 void
pktsched_corrupt_packet(pktsched_pkt_t * pkt)290 pktsched_corrupt_packet(pktsched_pkt_t *pkt)
291 {
292 	struct mbuf *m = NULL;
293 	uint8_t *data = NULL;
294 	uint32_t data_len = 0;
295 	uint32_t rand32, rand_off, rand_bit;
296 #if SKYWALK
297 	struct __kern_packet *p = NULL;
298 #endif /* SKYWALK */
299 
300 	switch (pkt->pktsched_ptype) {
301 	case QP_MBUF:
302 		m = pkt->pktsched_pkt_mbuf;
303 		data = mtod(m, uint8_t *);
304 		data_len = m->m_pkthdr.len;
305 		break;
306 #if SKYWALK
307 	case QP_PACKET:
308 		p = pkt->pktsched_pkt_kpkt;
309 		if (p->pkt_pflags & PKT_F_MBUF_DATA) {
310 			m = p->pkt_mbuf;
311 			data = mtod(m, uint8_t *);
312 			data_len = m->m_pkthdr.len;
313 		} else {
314 			MD_BUFLET_ADDR_DLEN(p, data, data_len);
315 		}
316 		break;
317 #endif /* SKYWALK */
318 
319 	default:
320 		/* NOTREACHED */
321 		VERIFY(0);
322 		__builtin_unreachable();
323 	}
324 
325 	read_frandom(&rand32, sizeof(rand32));
326 	rand_bit = rand32 & 0x8;
327 	rand_off = (rand32 >> 3) % data_len;
328 	data[rand_off] ^= 1 << rand_bit;
329 }
330 
331 void
pktsched_free_pkt(pktsched_pkt_t * pkt)332 pktsched_free_pkt(pktsched_pkt_t *pkt)
333 {
334 	uint32_t cnt = pkt->pktsched_pcnt;
335 	ASSERT(cnt != 0);
336 
337 	switch (pkt->pktsched_ptype) {
338 	case QP_MBUF: {
339 		struct mbuf *m;
340 
341 		m = pkt->pktsched_pkt_mbuf;
342 		if (cnt == 1) {
343 			VERIFY(m->m_nextpkt == NULL);
344 		} else {
345 			VERIFY(m->m_nextpkt != NULL);
346 		}
347 		m_freem_list(m);
348 		break;
349 	}
350 #if SKYWALK
351 	case QP_PACKET: {
352 		struct __kern_packet *kpkt;
353 		int pcnt = 0;
354 
355 		kpkt = pkt->pktsched_pkt_kpkt;
356 		if (cnt == 1) {
357 			VERIFY(kpkt->pkt_nextpkt == NULL);
358 		} else {
359 			VERIFY(kpkt->pkt_nextpkt != NULL);
360 		}
361 		pp_free_packet_chain(kpkt, &pcnt);
362 		VERIFY(cnt == (uint32_t)pcnt);
363 		break;
364 	}
365 #endif /* SKYWALK */
366 
367 	default:
368 		VERIFY(0);
369 		/* NOTREACHED */
370 		__builtin_unreachable();
371 	}
372 	pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
373 	pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
374 	pkt->pktsched_plen = 0;
375 	pkt->pktsched_pcnt = 0;
376 }
377 
378 mbuf_svc_class_t
pktsched_get_pkt_svc(pktsched_pkt_t * pkt)379 pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
380 {
381 	mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
382 
383 	switch (pkt->pktsched_ptype) {
384 	case QP_MBUF:
385 		svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
386 		break;
387 
388 #if SKYWALK
389 	case QP_PACKET:
390 		svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
391 		break;
392 #endif /* SKYWALK */
393 
394 	default:
395 		VERIFY(0);
396 		/* NOTREACHED */
397 		__builtin_unreachable();
398 	}
399 
400 	return svc;
401 }
402 
/*
 * Extract scheduler-relevant per-packet fields, abstracting over the
 * mbuf and skywalk packet representations.  Any output argument may be
 * NULL when the caller does not need that field.  "flags" and
 * "timestamp" are returned as pointers into the packet header so the
 * caller can update them in place; the remaining fields are copied out.
 */
void
pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
    uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
    uint32_t *comp_gencnt)
{
	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);

		if (flags != NULL) {
			*flags = &pkth->pkt_flags;
		}
		if (timestamp != NULL) {
			*timestamp = &pkth->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = pkth->pkt_flowid;
		}
		if (flowsrc != NULL) {
			*flowsrc = pkth->pkt_flowsrc;
		}
		if (proto != NULL) {
			/*
			 * rdar://100524205 - We want to use the pkt_ext_flags
			 * to denote QUIC packets, but AQM is already written in
			 * such a way where IPPROTO_QUIC is used to denote QUIC
			 * packets.
			 */
			if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) {
				*proto = IPPROTO_QUIC;
			} else {
				*proto = pkth->pkt_proto;
			}
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = pkth->comp_gencnt;
		}

		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		if (flags != NULL) {
			/* use lower-32 bit for common flags */
			*flags = &kp->pkt_pflags32;
		}
		if (timestamp != NULL) {
			*timestamp = &kp->pkt_timestamp;
		}
		if (flowid != NULL) {
			*flowid = kp->pkt_flow_token;
		}
		if (flowsrc != NULL) {
			*flowsrc = (uint8_t)kp->pkt_flowsrc_type;
		}
		if (proto != NULL) {
			*proto = kp->pkt_transport_protocol;
		}
		if (comp_gencnt != NULL) {
			*comp_gencnt = kp->pkt_comp_gencnt;
		}

		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
478 
/*
 * Allocate and populate a flow-advisory entry describing the flow this
 * packet belongs to, for later flow-control notification.  "how" is the
 * blocking behavior passed through to flowadv_alloc_entry().  Returns
 * NULL if the allocation fails.
 */
struct flowadv_fcentry *
pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
{
#pragma unused(ifp)
	struct flowadv_fcentry *fce = NULL;

	switch (pkt->pktsched_ptype) {
	case QP_MBUF: {
		struct mbuf *m = pkt->pktsched_pkt_mbuf;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		/* field widths must match for the straight copies below */
		_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
		    sizeof(fce->fce_flowid));

		fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
		fce->fce_flowid = m->m_pkthdr.pkt_flowid;
#if SKYWALK
		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
		    sizeof(fce->fce_flowsrc_token));
		_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
		    sizeof(fce->fce_flowsrc_fidx));

		/*
		 * For channel-originated flows, the mbuf private fields
		 * carry the skywalk flow index/token.
		 */
		if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
			fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
			fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
			fce->fce_ifp = ifp;
		}
#endif /* SKYWALK */
		break;
	}

#if SKYWALK
	case QP_PACKET: {
		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;

		fce = flowadv_alloc_entry(how);
		if (fce == NULL) {
			break;
		}

		_CASSERT(sizeof(fce->fce_flowid) ==
		    sizeof(kp->pkt_flow_token));
		_CASSERT(sizeof(fce->fce_flowsrc_fidx) ==
		    sizeof(kp->pkt_flowsrc_fidx));
		_CASSERT(sizeof(fce->fce_flowsrc_token) ==
		    sizeof(kp->pkt_flowsrc_token));

		/* caller must only request advisories for FLOW_ADV packets */
		ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
		fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
		fce->fce_flowid = kp->pkt_flow_token;
		fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
		fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
		fce->fce_ifp = ifp;
		break;
	}
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return fce;
}
548 
549 uint32_t *
pktsched_get_pkt_sfb_vars(pktsched_pkt_t * pkt,uint32_t ** sfb_flags)550 pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
551 {
552 	uint32_t *hashp = NULL;
553 
554 	switch (pkt->pktsched_ptype) {
555 	case QP_MBUF: {
556 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
557 
558 		_CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
559 		_CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
560 		*sfb_flags = &pkth->pkt_mpriv_flags;
561 		hashp = &pkth->pkt_mpriv_hash;
562 		break;
563 	}
564 
565 #if SKYWALK
566 	case QP_PACKET: {
567 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
568 
569 		_CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t));
570 		_CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t));
571 		*sfb_flags = &kp->pkt_classq_flags;
572 		hashp = &kp->pkt_classq_hash;
573 		break;
574 	}
575 #endif /* SKYWALK */
576 
577 	default:
578 		VERIFY(0);
579 		/* NOTREACHED */
580 		__builtin_unreachable();
581 	}
582 
583 	return hashp;
584 }
585 
/*
 * Set the ECN Congestion Experienced (CE) codepoint on an mbuf-based
 * IPv4/IPv6 packet.  Returns 0 if the packet was marked (or was already
 * marked), EINVAL if the packet is not ECN-capable or the header cannot
 * be validated, EPROTONOSUPPORT for non-IP packets.
 */
static int
pktsched_mbuf_mark_ecn(struct mbuf* m)
{
	struct mbuf     *m0;
	void            *hdr;
	int             af;
	uint8_t         ipv;

	hdr = m->m_pkthdr.pkt_hdr;
	/* verify that hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (((caddr_t)hdr >= m0->m_data) &&
		    ((caddr_t)hdr < m0->m_data + m0->m_len)) {
			break;
		}
	}
	if (m0 == NULL) {
		return EINVAL;
	}
	/* first nibble of the IP header is the version */
	ipv = IP_VHL_V(*(uint8_t *)hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}

	switch (af) {
	case AF_INET: {
		struct ip *ip = hdr;
		uint8_t otos;
		int sum;

		/* the full IP header must fit inside this mbuf's buffer */
		if (((uintptr_t)ip + sizeof(*ip)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;
		/*
		 * update checksum (from RFC1624) only if hw
		 * checksum is not supported.
		 *         HC' = ~(~HC + ~m + m')
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16); /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
		}
		return 0;
	}
	case AF_INET6: {
		struct ip6_hdr *ip6 = hdr;
		u_int32_t flowlabel;

		if (((uintptr_t)ip6 + sizeof(*ip6)) >
		    ((uintptr_t)mbuf_datastart(m0) + mbuf_maxlen(m0))) {
			return EINVAL;    /* out of bounds */
		}
		/* ECN bits live at bits 20-21 of the v6 flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,  mark CE
		 * (no checksum to update for IPv6)
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);
		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
678 
/*
 * Set the ECN Congestion Experienced (CE) codepoint on a skywalk kernel
 * packet.  Locates the L3 header either from the flow-classifier
 * metadata or directly from the buflet, then applies the same marking
 * rules as pktsched_mbuf_mark_ecn().  Returns 0 on success/already
 * marked, EINVAL if not ECN-capable, EPROTONOSUPPORT otherwise.
 */
static int
pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
{
	uint8_t ipv = 0, *l3_hdr;

	/*
	 * NOTE(review): __improbable here implies classified flows are the
	 * rare case on this path -- confirm against callers.
	 */
	if (__improbable((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		/* classifier already recorded the IP version and L3 header */
		ipv = kpkt->pkt_flow_ip_ver;
		l3_hdr = (uint8_t *)kpkt->pkt_flow_ip_hdr;
	} else {
		uint8_t *pkt_buf;
		uint16_t bdlen, bdlim, bdoff;
		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);

		/* takes care of both IPv4 and IPv6 */
		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
		if (ipv == 4) {
			ipv = IPVERSION;
		} else if (ipv == 6) {
			ipv = IPV6_VERSION;
		} else {
			ipv = 0;
		}
	}

	switch (ipv) {
	case IPVERSION: {
		uint8_t otos;
		int sum;

		struct ip *ip = (struct ip *)(void *)l3_hdr;
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;    /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;

		/* incremental checksum update per RFC1624: HC' = ~(~HC + ~m + m') */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += (~otos & 0xffff) + ip->ip_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16); /* add carry */
		ip->ip_sum = htons(~sum & 0xffff);

		return 0;
	}
	case IPV6_VERSION: {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;
		u_int32_t flowlabel;
		/* ECN bits live at bits 20-21 of the v6 flow word */
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;    /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;    /* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);

		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
755 
756 int
pktsched_mark_ecn(pktsched_pkt_t * pkt)757 pktsched_mark_ecn(pktsched_pkt_t *pkt)
758 {
759 	switch (pkt->pktsched_ptype) {
760 	case QP_MBUF:
761 		return pktsched_mbuf_mark_ecn(pkt->pktsched_pkt_mbuf);
762 	case QP_PACKET:
763 		return pktsched_kpkt_mark_ecn(pkt->pktsched_pkt_kpkt);
764 	default:
765 		VERIFY(0);
766 		/* NOTREACHED */
767 		__builtin_unreachable();
768 	}
769 }
770 
771 boolean_t
pktsched_is_pkt_l4s(pktsched_pkt_t * pkt)772 pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
773 {
774 	switch (pkt->pktsched_ptype) {
775 	case QP_MBUF: {
776 		struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
777 		return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
778 	}
779 	case QP_PACKET: {
780 		struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
781 		return (kp->pkt_pflags & PKT_F_L4S) != 0;
782 	}
783 
784 	default:
785 		VERIFY(0);
786 		/* NOTREACHED */
787 		__builtin_unreachable();
788 	}
789 	return FALSE;
790 }
791