/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * The migration of a flow queue between the different states is summarized
 * in the state diagram below. (RFC 8290)
 *
 * +-----------------+                +------------------+
 * |                 |     Empty      |                  |
 * |     Empty       |<---------------+       Old        +----+
 * |                 |                |                  |    |
 * +-------+---------+                +------------------+    |
 *         |                             ^            ^       |Credits
 *         |Arrival                      |            |       |Exhausted
 *         v                             |            |       |
 * +-----------------+                   |            |       |
 * |                 |      Empty or     |            |       |
 * |      New        +-------------------+            +-------+
 * |                 | Credits Exhausted
 * +-----------------+
 *
 * In this implementation of FQ-CODEL, a flow queue is a dynamically
 * allocated object. An active flow queue goes through the above cycle of
 * state transitions very often. To avoid the cost of frequent flow queue
 * object allocation/free, this implementation retains the flow queue object
 * in the [Empty] state on an Empty flow queue list, with an active reference
 * in the flow queue hash table. The flow queue objects on the Empty flow
 * queue list have an associated age and are purged accordingly.
 */
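
/*
 * Illustrative life cycle (sketch in a comment only; ordering simplified,
 * helper arguments as used elsewhere in this file):
 *
 *	fq = fq_alloc(ptype);                          // first Arrival
 *	// fq_addq()/fq_getq_flow() move the queue between New and Old
 *	fq_if_move_to_empty_flow(fqs, fq_cl, fq, now); // drained -> [Empty]
 *	fq_destroy(fq, ptype);                         // purged by age
 */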

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/sdt.h>
#include <kern/zalloc.h>
#include <netinet/in.h>

#include <net/classq/classq.h>
#include <net/classq/if_classq.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <net/classq/classq_fq_codel.h>
#include <net/droptap.h>

#include <netinet/tcp_var.h>

#define FQ_ZONE_MAX     (32 * 1024)     /* across all interfaces */

#define DTYPE_NODROP    0       /* no drop */
#define DTYPE_FORCED    1       /* a "forced" drop */
#define DTYPE_EARLY     2       /* an "unforced" (early) drop */

static uint32_t pkt_compressor = 1;
static uint64_t l4s_ce_threshold = 0; /* in usec */
static uint32_t l4s_local_ce_report = 0;
static uint64_t pkt_pacing_leeway = 0; /* in usec */
static uint64_t max_pkt_pacing_interval = 3 * NSEC_PER_SEC;
static uint64_t l4s_min_delay_threshold = 20 * NSEC_PER_MSEC; /* 20 ms */
#if (DEBUG || DEVELOPMENT)
SYSCTL_NODE(_net_classq, OID_AUTO, flow_q, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_UINT(_net_classq_flow_q, OID_AUTO, pkt_compressor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_compressor, 0, "enable pkt compression");

SYSCTL_QUAD(_net_classq, OID_AUTO, l4s_ce_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_ce_threshold,
    "L4S CE threshold");

SYSCTL_UINT(_net_classq_flow_q, OID_AUTO, l4s_local_ce_report,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_local_ce_report, 0,
    "enable L4S local CE report");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, pkt_pacing_leeway,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_pacing_leeway, "packet pacing leeway");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, max_pkt_pacing_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &max_pkt_pacing_interval, "max packet pacing interval");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, l4s_min_delay_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_min_delay_threshold, "l4s min delay threshold");
#endif /* (DEBUG || DEVELOPMENT) */
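
/*
 * Usage note (assumption: standard sysctl(8) syntax): on DEBUG or
 * DEVELOPMENT kernels the tunables above are adjustable at run time,
 * e.g. "sysctl net.classq.flow_q.pkt_compressor=0" turns packet
 * compression off.
 */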

void
fq_codel_init(void)
{
	_CASSERT(AQM_KTRACE_AON_FLOW_HIGH_DELAY == 0x8300004);
	_CASSERT(AQM_KTRACE_AON_THROTTLE == 0x8300008);
	_CASSERT(AQM_KTRACE_AON_FLOW_OVERWHELMING == 0x830000c);
	_CASSERT(AQM_KTRACE_AON_FLOW_DQ_STALL == 0x8300010);

	_CASSERT(AQM_KTRACE_STATS_FLOW_ENQUEUE == 0x8310004);
	_CASSERT(AQM_KTRACE_STATS_FLOW_DEQUEUE == 0x8310008);
	_CASSERT(AQM_KTRACE_STATS_FLOW_CTL == 0x831000c);
	_CASSERT(AQM_KTRACE_STATS_FLOW_ALLOC == 0x8310010);
	_CASSERT(AQM_KTRACE_STATS_FLOW_DESTROY == 0x8310014);
	_CASSERT(AQM_KTRACE_STATS_FLOW_REPORT_CE == 0x8310018);
	_CASSERT(AQM_KTRACE_STATS_GET_QLEN == 0x831001c);
	_CASSERT(AQM_KTRACE_TX_NOT_READY == 0x8310020);
	_CASSERT(AQM_KTRACE_TX_PACEMAKER == 0x8310024);
}

fq_t *
fq_alloc(classq_pkt_type_t ptype)
{
	fq_t *fq = NULL;

	fq = kalloc_type(fq_t, Z_WAITOK_ZERO);
	if (ptype == QP_MBUF) {
		MBUFQ_INIT(&fq->fq_mbufq);
	}
#if SKYWALK
	else {
		VERIFY(ptype == QP_PACKET);
		KPKTQ_INIT(&fq->fq_kpktq);
	}
#endif /* SKYWALK */
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;

	return fq;
}
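
/*
 * fq_destroy() below is the counterpart of fq_alloc(); its VERIFYs spell
 * out the contract: the flow queue must be drained, off the dequeue list,
 * and carry none of the NEW/OLD/EMPTY flow flags before it is freed.
 */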
void
fq_destroy(fq_t *fq, classq_pkt_type_t ptype)
{
	VERIFY(!fq->fq_in_dqlist);
	VERIFY(fq_empty(fq, ptype));
	VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW |
	    FQF_EMPTY_FLOW)));
	VERIFY(fq->fq_bytes == 0);
	kfree_type(fq_t, fq);
}

static inline void
fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
    u_int64_t *now)
{
	u_int64_t maxgetqtime, update_interval;
	if (FQ_IS_DELAY_HIGH(flowq) || flowq->fq_getqtime == 0 ||
	    fq_empty(flowq, fqs->fqs_ptype) ||
	    flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) {
		return;
	}

	update_interval = FQ_UPDATE_INTERVAL(flowq);
	maxgetqtime = flowq->fq_getqtime + update_interval;
	if ((*now) > maxgetqtime) {
		/*
		 * There was no dequeue for an update interval's worth of
		 * time, which means that the queue is stalled.
		 */
		FQ_SET_DELAY_HIGH(flowq);
		fq_cl->fcl_stat.fcl_dequeue_stall++;
		os_log_error(OS_LOG_DEFAULT, "%s:num: %d, "
		    "scidx: %d, flow: 0x%x, iface: %s grp: %hhu", __func__,
		    fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index,
		    flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp),
		    FQ_GROUP(flowq)->fqg_index);
		KDBG(AQM_KTRACE_AON_FLOW_DQ_STALL, flowq->fq_flowhash,
		    AQM_KTRACE_FQ_GRP_SC_IDX(flowq), flowq->fq_bytes,
		    (*now) - flowq->fq_getqtime);
	}
}
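
/*
 * Worked example for fq_detect_dequeue_stall() above (hypothetical
 * values): with FQ_UPDATE_INTERVAL(flowq) of 100ms, fq_getqtime = T and
 * at least FQ_MIN_FC_THRESHOLD_BYTES queued, a call with
 * *now > T + 100ms marks the flow delay-high and increments
 * fcl_dequeue_stall; later arrivals may then trigger flow control in
 * fq_addq().
 */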

void
fq_head_drop(fq_if_t *fqs, fq_t *fq)
{
	pktsched_pkt_t pkt;
	volatile uint32_t *__single pkt_flags;
	uint64_t *__single pkt_timestamp;
	struct ifclassq *ifq = fqs->fqs_ifq;

	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	if (pkt.pktsched_pkt_mbuf == NULL) {
		return;
	}

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
	IFCQ_CONVERT_LOCK(ifq);
	pktsched_free_pkt(&pkt);
}

static int
fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl,
    pktsched_pkt_t *pkt)
{
	classq_pkt_type_t ptype = fqs->fqs_ptype;
	uint32_t comp_gencnt = 0;
	uint64_t *__single pkt_timestamp;
	uint64_t old_timestamp = 0;
	uint32_t old_pktlen = 0;
	struct ifclassq *ifq = fqs->fqs_ifq;

	if (__improbable(pkt_compressor == 0)) {
		return 0;
	}

	pktsched_get_pkt_vars(pkt, NULL, &pkt_timestamp, NULL, NULL, NULL,
	    &comp_gencnt, NULL);

	if (comp_gencnt == 0) {
		return 0;
	}

	fq_cl->fcl_stat.fcl_pkts_compressible++;

	if (fq_empty(fq, fqs->fqs_ptype)) {
		return 0;
	}

	if (ptype == QP_MBUF) {
		struct mbuf *m = MBUFQ_LAST(&fq->fq_mbufq);

		if (comp_gencnt != m->m_pkthdr.comp_gencnt) {
			return 0;
		}

		/* If we get this far, we should merge/replace the segment */
		MBUFQ_REMOVE(&fq->fq_mbufq, m);
		old_pktlen = m_pktlen(m);
		old_timestamp = m->m_pkthdr.pkt_timestamp;

		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

		if (__improbable(droptap_verbose > 0)) {
			droptap_output_mbuf(m, DROP_REASON_AQM_COMPRESSED, NULL, 0, 0,
			    fqs->fqs_ifq->ifcq_ifp);
		}

		m_freem(m);
	}
#if SKYWALK
	else {
		struct __kern_packet *kpkt = KPKTQ_LAST(&fq->fq_kpktq);

		if (comp_gencnt != kpkt->pkt_comp_gencnt) {
			return 0;
		}

		/* If we get this far, we should merge/replace the segment */
		KPKTQ_REMOVE(&fq->fq_kpktq, kpkt);
		old_pktlen = kpkt->pkt_length;
		old_timestamp = kpkt->pkt_timestamp;

		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

		if (__improbable(droptap_verbose > 0)) {
			droptap_output_packet(SK_PKT2PH(kpkt), DROP_REASON_AQM_COMPRESSED, NULL, 0, 0,
			    fqs->fqs_ifq->ifcq_ifp, 0, NULL, -1, NULL, 0, 0);
		}

		struct kern_pbufpool *pp =
		    __DECONST(struct kern_pbufpool *, ((struct __kern_quantum *)kpkt)->qum_pp);
		pp_free_packet(pp, (uint64_t)kpkt);
	}
#endif /* SKYWALK */

	fq->fq_bytes -= old_pktlen;
	fq_cl->fcl_stat.fcl_byte_cnt -= old_pktlen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, old_pktlen);

	FQ_GRP_DEC_LEN(fq);
	FQ_GRP_DEC_BYTES(fq, old_pktlen);

	*pkt_timestamp = old_timestamp;

	return CLASSQEQ_COMPRESSED;
}
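
/*
 * Example for fq_compressor() above (hypothetical traffic): if a pure
 * TCP ACK is enqueued with comp_gencnt == 7 while the tail packet of the
 * same flow queue also carries comp_gencnt == 7, the queued ACK is freed
 * and the accounting rolled back; the caller then enqueues the new ACK,
 * which inherits the old packet's timestamp, so only the newest ACK ever
 * reaches the driver.
 */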

int
fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt,
    fq_if_classq_t *fq_cl)
{
	int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS;
	u_int64_t now;
	fq_t *fq = NULL;
	uint64_t *__single pkt_timestamp;
	volatile uint32_t *__single pkt_flags;
	uint32_t pkt_flowid, cnt;
	uint8_t pkt_proto, pkt_flowsrc;
	fq_tfc_type_t tfc_type = FQ_TFC_C;

	cnt = pkt->pktsched_pcnt;
	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
	    &pkt_flowsrc, &pkt_proto, NULL, NULL);

	/*
	 * XXX Not walking the chain to set this flag on every packet.
	 * This flag is only used for debugging. Nothing is affected if it's
	 * not set.
	 */
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		/* See comments in <rdar://problem/14040693> */
		VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
		*pkt_flags |= PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (ifclassq_enable_l4s) {
		tfc_type = pktsched_is_pkt_l4s(pkt) ? FQ_TFC_L4S : FQ_TFC_C;
	}

	/*
	 * Timestamps for every packet must be set prior to entering this path.
	 */
	now = *pkt_timestamp;
	ASSERT(now > 0);

	/* find the flowq for this packet */
	fq = fq_if_hash_pkt(fqs, fq_grp, pkt_flowid, pktsched_get_pkt_svc(pkt),
	    now, true, tfc_type);
	if (__improbable(fq == NULL)) {
		DTRACE_IP1(memfail__drop, fq_if_t *, fqs);
		/* drop the packet if we could not allocate a flow queue */
		fq_cl->fcl_stat.fcl_drop_memfailure += cnt;
		return CLASSQEQ_DROP;
	}
	VERIFY(fq->fq_group == fq_grp);
	VERIFY(fqs->fqs_ptype == pkt->pktsched_ptype);

	KDBG(AQM_KTRACE_STATS_FLOW_ENQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, pktsched_get_pkt_len(pkt));

	fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);

	/*
	 * Skip the dropping part if it's L4S. Flow control or ECN marking decision
	 * will be made at dequeue time.
	 */
	if (ifclassq_enable_l4s && tfc_type == FQ_TFC_L4S) {
		fq_cl->fcl_stat.fcl_l4s_pkts += cnt;
		droptype = DTYPE_NODROP;
	}

	if (__improbable(FQ_IS_DELAY_HIGH(fq) || FQ_IS_OVERWHELMING(fq))) {
		if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
		    (*pkt_flags & PKTF_FLOW_ADV)) {
			fc_adv = 1;
			/*
			 * If the flow is suspended or it is not
			 * TCP/QUIC, drop the chain.
			 */
			if ((pkt_proto != IPPROTO_TCP) &&
			    (pkt_proto != IPPROTO_QUIC)) {
				droptype = DTYPE_EARLY;
				fq_cl->fcl_stat.fcl_drop_early += cnt;
				IFCQ_DROP_ADD(fqs->fqs_ifq, cnt, pktsched_get_pkt_len(pkt));
			}
			DTRACE_IP6(flow__adv, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    int, droptype, pktsched_pkt_t *, pkt,
			    uint32_t, cnt);
		} else {
			/*
			 * Need to drop packets to make room for the new
			 * ones. Try to drop from the head of the queue
			 * instead of the latest packets.
			 */
			if (!fq_empty(fq, fqs->fqs_ptype)) {
				uint32_t i;

				for (i = 0; i < cnt; i++) {
					fq_head_drop(fqs, fq);
				}
				droptype = DTYPE_NODROP;
			} else {
				droptype = DTYPE_EARLY;
			}
			fq_cl->fcl_stat.fcl_drop_early += cnt;

			DTRACE_IP6(no__flow__adv, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    int, droptype, pktsched_pkt_t *, pkt,
			    uint32_t, cnt);
		}
	}

	/* Set the return code correctly */
	if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) {
		if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
			fq->fq_flags |= FQF_FLOWCTL_ON;
			/* deliver flow control advisory error */
			if (droptype == DTYPE_NODROP) {
				ret = CLASSQEQ_SUCCESS_FC;
			} else {
				/* dropped due to flow control */
				ret = CLASSQEQ_DROP_FC;
			}
		} else {
			/*
			 * if we could not flow control the flow, it is
			 * better to drop
			 */
			droptype = DTYPE_FORCED;
			ret = CLASSQEQ_DROP_FC;
			fq_cl->fcl_stat.fcl_flow_control_fail++;
		}
		DTRACE_IP3(fc__ret, fq_if_t *, fqs, int, droptype, int, ret);
	}

	/*
	 * If the queue length hits the queue limit, drop a chain with the
	 * same number of packets from the front of the queue for a flow with
	 * maximum number of bytes. This will penalize heavy and unresponsive
	 * flows. It will also avoid a tail drop.
	 */
	if (__improbable(droptype == DTYPE_NODROP &&
	    fq_if_at_drop_limit(fqs))) {
		uint32_t i;

		if (fqs->fqs_large_flow == fq) {
			/*
			 * Drop from the head of the current fq. Since a
			 * new packet will be added to the tail, it is ok
			 * to leave fq in place.
			 */
			DTRACE_IP5(large__flow, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    pktsched_pkt_t *, pkt, uint32_t, cnt);

			for (i = 0; i < cnt; i++) {
				fq_head_drop(fqs, fq);
			}
			fq_cl->fcl_stat.fcl_drop_overflow += cnt;

			/*
			 * TCP and QUIC will react to the loss of those head dropped pkts
			 * and adjust send rate.
			 */
			if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
			    (*pkt_flags & PKTF_FLOW_ADV) &&
			    (pkt_proto != IPPROTO_TCP) &&
			    (pkt_proto != IPPROTO_QUIC)) {
				if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
					fq->fq_flags |= FQF_FLOWCTL_ON;
					FQ_SET_OVERWHELMING(fq);
					fq_cl->fcl_stat.fcl_overwhelming++;
					/* deliver flow control advisory error */
					ret = CLASSQEQ_SUCCESS_FC;
				}
			}
		} else {
			if (fqs->fqs_large_flow == NULL) {
				droptype = DTYPE_FORCED;
				fq_cl->fcl_stat.fcl_drop_overflow += cnt;
				ret = CLASSQEQ_DROP;

				DTRACE_IP5(no__large__flow, fq_if_t *, fqs,
				    fq_if_classq_t *, fq_cl, fq_t *, fq,
				    pktsched_pkt_t *, pkt, uint32_t, cnt);

				/*
				 * if this fq was freshly created and there
				 * is nothing to enqueue, move it to empty list
				 */
				if (fq_empty(fq, fqs->fqs_ptype) &&
				    !(fq->fq_flags & (FQF_NEW_FLOW |
				    FQF_OLD_FLOW))) {
					fq_if_move_to_empty_flow(fqs, fq_cl,
					    fq, now);
					fq = NULL;
				}
			} else {
				DTRACE_IP5(different__large__flow,
				    fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
				    fq_t *, fq, pktsched_pkt_t *, pkt,
				    uint32_t, cnt);

				for (i = 0; i < cnt; i++) {
					fq_if_drop_packet(fqs, now);
				}
			}
		}
	}

	fq_cl->fcl_flags &= ~FCL_PACED;

	if (__probable(droptype == DTYPE_NODROP)) {
		uint32_t chain_len = pktsched_get_pkt_len(pkt);
		int ret_compress = 0;

		/*
		 * We do not compress if we are enqueuing a chain.
		 * Traversing the chain to look for acks would defeat the
		 * purpose of batch enqueueing.
		 */
		if (cnt == 1) {
			ret_compress = fq_compressor(fqs, fq, fq_cl, pkt);
			if (ret_compress == CLASSQEQ_COMPRESSED) {
				fq_cl->fcl_stat.fcl_pkts_compressed++;
			}
		}
		DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
		    fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt);
		fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt,
		    pkt->pktsched_ptype);

		fq->fq_bytes += chain_len;
		fq_cl->fcl_stat.fcl_byte_cnt += chain_len;
		fq_cl->fcl_stat.fcl_pkt_cnt += cnt;

		/*
		 * check if this queue will qualify to be the next
		 * victim queue
		 */
		fq_if_is_flow_heavy(fqs, fq);
	} else {
		DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret);
		return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
	}

	/*
	 * If the queue is not currently active, add it to the end of new
	 * flows list for that service class.
	 */
	if ((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == 0) {
		VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL);
		STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink);
		fq->fq_flags |= FQF_NEW_FLOW;

		fq_cl->fcl_stat.fcl_newflows_cnt++;

		fq->fq_deficit = fq_cl->fcl_quantum;
	}
	return ret;
}
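
/*
 * Summary of fq_addq() above (descriptive only): hash or allocate the
 * flow queue; detect a dequeue stall; if the flow is delay-high or
 * overwhelming, prefer flow-control advisories for TCP/QUIC and head
 * drops otherwise; at the interface drop limit, head-drop from the
 * largest flow to penalize heavy flows and avoid tail drops; finally
 * compress (single packets only) and enqueue, activating an inactive
 * queue on the new-flows list with a full quantum of deficit.
 */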

void
fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
	classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
	uint32_t plen;
	fq_if_classq_t *fq_cl;
	struct ifclassq *ifq = fqs->fqs_ifq;

	fq_dequeue(fq, &p, fqs->fqs_ptype);
	if (p.cp_ptype == QP_INVALID) {
		VERIFY(p.cp_mbuf == NULL);
		return;
	}

	fq->fq_next_tx_time = FQ_INVALID_TX_TS;

	pktsched_pkt_encap(pkt, &p);
	plen = pktsched_get_pkt_len(pkt);

	VERIFY(fq->fq_bytes >= plen);
	fq->fq_bytes -= plen;

	fq_cl = &FQ_CLASSQ(fq);
	fq_cl->fcl_stat.fcl_byte_cnt -= plen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	fq_cl->fcl_flags &= ~FCL_PACED;

	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, plen);

	FQ_GRP_DEC_LEN(fq);
	FQ_GRP_DEC_BYTES(fq, plen);

	/* Reset getqtime so that we don't count idle times */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fq->fq_getqtime = 0;
	}
}

/*
 * fq_get_next_tx_time returns FQ_INVALID_TX_TS when there is no tx time
 * in the fq.
 */
static uint64_t
fq_get_next_tx_time(fq_if_t *fqs, fq_t *fq)
{
	uint64_t tx_time = FQ_INVALID_TX_TS;

	/*
	 * Check the cached value in the fq first.
	 */
	if (fq->fq_next_tx_time != FQ_INVALID_TX_TS) {
		return fq->fq_next_tx_time;
	}

	switch (fqs->fqs_ptype) {
	case QP_MBUF: {
		struct mbuf *m;
		if ((m = MBUFQ_FIRST(&fq->fq_mbufq)) != NULL) {
			struct m_tag *tag;
			tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
			    KERNEL_TAG_TYPE_AQM);
			if (tag != NULL) {
				tx_time = *(uint64_t *)tag->m_tag_data;
			}
		}
		break;
	}
	case QP_PACKET: {
		struct __kern_packet *p = KPKTQ_FIRST(&fq->fq_kpktq);
		if (__probable(p != NULL)) {
			tx_time = __packet_get_tx_timestamp(SK_PKT2PH(p));
		}
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * Cache the tx time in the fq. The cached value is cleared after a
	 * dequeue or drop from the fq.
	 */
	fq->fq_next_tx_time = tx_time;

	return tx_time;
}
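
/*
 * Note: FQ_INVALID_TX_TS from fq_get_next_tx_time() above means "no
 * pacing constraint"; fq_tx_time_ready() below treats such a packet as
 * immediately transmittable.
 */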

/*
 * fq_tx_time_ready returns true if the fq is empty, so that it doesn't
 * affect the caller's logic for handling empty flows.
 */
boolean_t
fq_tx_time_ready(fq_if_t *fqs, fq_t *fq, uint64_t now, uint64_t *ready_time)
{
	uint64_t pkt_tx_time;
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);

	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s || fq->fq_tfc_type != FQ_TFC_L4S) {
		return TRUE;
	}

	pkt_tx_time = fq_get_next_tx_time(fqs, fq);
	if (ready_time != NULL) {
		*ready_time = pkt_tx_time;
	}

	if (pkt_tx_time <= now + pkt_pacing_leeway ||
	    pkt_tx_time == FQ_INVALID_TX_TS) {
		return TRUE;
	}

	/*
	 * Ignore the tx time if it's scheduled too far in the future
	 */
	if (pkt_tx_time > max_pkt_pacing_interval + now) {
		fq_cl->fcl_stat.fcl_ignore_tx_time++;
		return TRUE;
	}

	ASSERT(pkt_tx_time != FQ_INVALID_TX_TS);
	KDBG(AQM_KTRACE_TX_NOT_READY, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), now, pkt_tx_time);
	return FALSE;
}
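
/*
 * Worked example for fq_tx_time_ready() above (hypothetical values,
 * pkt_pacing_leeway = 0): if the head packet's tx time is now + 2ms, the
 * function returns FALSE and reports that time via *ready_time so the
 * caller can arm the pacemaker; if the tx time were now + 10s, beyond
 * the 3s max_pkt_pacing_interval default, it would be ignored and the
 * packet treated as ready.
 */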

void
fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now)
{
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
	int64_t qdelay = 0;
	volatile uint32_t *__single pkt_flags;
	uint64_t *__single pkt_timestamp, pkt_tx_time = 0, pacing_delay = 0;
	uint64_t fq_min_delay_threshold = FQ_TARGET_DELAY(fq);
	uint8_t pkt_flowsrc;
	boolean_t l4s_pkt;

	fq_getq_flow_internal(fqs, fq, pkt);
	if (pkt->pktsched_ptype == QP_INVALID) {
		VERIFY(pkt->pktsched_pkt_mbuf == NULL);
		return;
	}

	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, &pkt_flowsrc,
	    NULL, NULL, &pkt_tx_time);
	l4s_pkt = pktsched_is_pkt_l4s(pkt);
	if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
		if (pkt_tx_time > *pkt_timestamp) {
			pacing_delay = pkt_tx_time - *pkt_timestamp;
			fq_cl->fcl_stat.fcl_paced_pkts++;
			DTRACE_SKYWALK3(aqm__pacing__delta, uint64_t, now - pkt_tx_time,
			    fq_if_t *, fqs, fq_t *, fq);
		}
#if (DEVELOPMENT || DEBUG)
		else if (pkt_tx_time != 0) {
			DTRACE_SKYWALK5(aqm__miss__pacing__delay, uint64_t, *pkt_timestamp,
			    uint64_t, pkt_tx_time, uint64_t, now, fq_if_t *,
			    fqs, fq_t *, fq);
		}
#endif // (DEVELOPMENT || DEBUG)
	}

	/* this will compute qdelay in nanoseconds */
	if (now > *pkt_timestamp) {
		qdelay = now - *pkt_timestamp;
	}

	/* Update min/max/avg qdelay for the respective class */
	if (fq_cl->fcl_stat.fcl_min_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay < fq_cl->fcl_stat.fcl_min_qdelay)) {
		fq_cl->fcl_stat.fcl_min_qdelay = qdelay;
	}

	if (fq_cl->fcl_stat.fcl_max_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay > fq_cl->fcl_stat.fcl_max_qdelay)) {
		fq_cl->fcl_stat.fcl_max_qdelay = qdelay;
	}

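	/*
	 * The update below maintains the incremental mean
	 * avg' = (avg * n + qdelay) / (n + 1), where n is the number of
	 * dequeues so far; the overflow checks restart the average instead
	 * of letting the intermediate products wrap.
	 */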
	uint64_t num_dequeues = fq_cl->fcl_stat.fcl_dequeue;

	if (num_dequeues == 0) {
		fq_cl->fcl_stat.fcl_avg_qdelay = qdelay;
	} else if (qdelay > 0) {
		uint64_t res = 0;
		if (os_add_overflow(num_dequeues, 1, &res)) {
			/* Reset the dequeue num and dequeue bytes */
			fq_cl->fcl_stat.fcl_dequeue = num_dequeues = 0;
			fq_cl->fcl_stat.fcl_dequeue_bytes = 0;
			fq_cl->fcl_stat.fcl_avg_qdelay = qdelay;
			os_log_info(OS_LOG_DEFAULT, "%s: dequeue num overflow, "
			    "flow: 0x%x, iface: %s", __func__, fq->fq_flowhash,
			    if_name(fqs->fqs_ifq->ifcq_ifp));
		} else {
			uint64_t product = 0;
			if (os_mul_overflow(fq_cl->fcl_stat.fcl_avg_qdelay,
			    num_dequeues, &product) || os_add_overflow(product, qdelay, &res)) {
				fq_cl->fcl_stat.fcl_avg_qdelay = qdelay;
			} else {
				fq_cl->fcl_stat.fcl_avg_qdelay = res /
				    (num_dequeues + 1);
			}
		}
	}

	fq->fq_pkts_since_last_report++;
	if (ifclassq_enable_l4s && l4s_pkt) {
		/*
		 * A safeguard to make sure that L4S is not going to build a
		 * huge queue if we encounter unexpected problems (e.g., if
		 * ACKs don't arrive in a timely manner due to congestion in
		 * the reverse path).
		 */
		fq_min_delay_threshold = l4s_min_delay_threshold;

		if ((l4s_ce_threshold != 0 && qdelay > l4s_ce_threshold + pacing_delay) ||
		    (l4s_ce_threshold == 0 && qdelay > FQ_TARGET_DELAY(fq) + pacing_delay)) {
			DTRACE_SKYWALK4(aqm__mark__ce, uint64_t, qdelay, uint64_t, pacing_delay,
			    fq_if_t *, fqs, fq_t *, fq);
			KDBG(AQM_KTRACE_STATS_FLOW_REPORT_CE, fq->fq_flowhash,
			    AQM_KTRACE_FQ_GRP_SC_IDX(fq), qdelay, pacing_delay);
			/*
			 * The packet buffer that pktsched_mark_ecn writes to can be pageable.
			 * Since it is not safe to write to pageable memory while preemption
			 * is disabled, convert the spin lock into a mutex.
			 */
			IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
			if (__improbable(l4s_local_ce_report != 0) &&
			    (*pkt_flags & PKTF_FLOW_ADV) != 0 &&
			    fq_if_report_ce(fqs, pkt, 1, fq->fq_pkts_since_last_report)) {
				fq->fq_pkts_since_last_report = 0;
				fq_cl->fcl_stat.fcl_ce_reported++;
			} else if (pktsched_mark_ecn(pkt) == 0) {
				fq_cl->fcl_stat.fcl_ce_marked++;
			} else {
				fq_cl->fcl_stat.fcl_ce_mark_failures++;
			}
		}
	}

	ASSERT(pacing_delay <= INT64_MAX);
	qdelay = MAX(0, qdelay - (int64_t)pacing_delay);
	if (fq->fq_min_qdelay == 0 ||
	    (u_int64_t)qdelay < fq->fq_min_qdelay) {
		fq->fq_min_qdelay = qdelay;
	}

	if (now >= fq->fq_updatetime) {
		if (fq->fq_min_qdelay > fq_min_delay_threshold) {
			if (!FQ_IS_DELAY_HIGH(fq)) {
				FQ_SET_DELAY_HIGH(fq);
			}
		} else {
			FQ_CLEAR_DELAY_HIGH(fq);
		}
		/* Reset measured queue delay and update time */
		fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
		fq->fq_min_qdelay = 0;
	}

	if (fqs->fqs_large_flow != fq || !fq_if_almost_at_drop_limit(fqs)) {
		FQ_CLEAR_OVERWHELMING(fq);
	}
	if (!FQ_IS_DELAY_HIGH(fq) || fq_empty(fq, fqs->fqs_ptype)) {
		FQ_CLEAR_DELAY_HIGH(fq);
	}

	if ((fq->fq_flags & FQF_FLOWCTL_ON) &&
	    !FQ_IS_DELAY_HIGH(fq) && !FQ_IS_OVERWHELMING(fq)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}

	if (fq_empty(fq, fqs->fqs_ptype)) {
		/* Reset getqtime so that we don't count idle times */
		fq->fq_getqtime = 0;
	} else {
		fq->fq_getqtime = now;
	}
	fq_if_is_flow_heavy(fqs, fq);

	*pkt_timestamp = 0;
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
895