xref: /xnu-10063.121.3/bsd/net/pktsched/pktsched_fq_codel.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <kern/zalloc.h>
32 #include <net/ethernet.h>
33 #include <net/if_var.h>
34 #include <net/if.h>
35 #include <net/classq/classq.h>
36 #include <net/classq/classq_fq_codel.h>
37 #include <net/pktsched/pktsched_fq_codel.h>
38 #include <os/log.h>
39 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
40 #include <mach/thread_act.h>
41 #include <kern/thread.h>
42 #include <kern/sched_prim.h>
43 
/*
 * Default DRR quantum (bytes) and per-service-class scaling of it.
 * Audio/video classes get twice the base quantum; voice and control get
 * 2/5 of it (a smaller quantum lowers per-round burst size for the
 * latency-sensitive classes).  All macro arguments are fully
 * parenthesized so the macros remain correct for expression arguments
 * (the previous `(_q * 2)` form mis-evaluated e.g. `a + b`).
 */
#define FQ_CODEL_DEFAULT_QUANTUM 1500

#define FQ_CODEL_QUANTUM_BK_SYS(_q)    (_q)
#define FQ_CODEL_QUANTUM_BK(_q)        (_q)
#define FQ_CODEL_QUANTUM_BE(_q)        (_q)
#define FQ_CODEL_QUANTUM_RD(_q)        (_q)
#define FQ_CODEL_QUANTUM_OAM(_q)       (_q)
#define FQ_CODEL_QUANTUM_AV(_q)        ((_q) * 2)
#define FQ_CODEL_QUANTUM_RV(_q)        ((_q) * 2)
#define FQ_CODEL_QUANTUM_VI(_q)        ((_q) * 2)
#define FQ_CODEL_QUANTUM_VO(_q)        (((_q) * 2) / 5)
#define FQ_CODEL_QUANTUM_CTL(_q)       (((_q) * 2) / 5)
56 
57 static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
58 static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);
59 
60 SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
61     0, "FQ-CODEL parameters");
62 
63 SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, fq_enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED,
64     &ifclassq_enable_pacing, 0, "Enable pacing");
65 
66 static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
67 #if (DEVELOPMENT || DEBUG)
68 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
69     CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
#endif /* DEVELOPMENT || DEBUG */
71 
72 unsigned int ifclassq_enable_pacing = 1;
73 
74 typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;
75 
76 static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
77 static void fq_if_destroy(fq_if_t *fqs);
78 static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
79     uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
80 static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t,
81     int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
82     uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*);
83 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
84 static void fq_if_purge(fq_if_t *);
85 static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
86 static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
87     uint64_t);
88 static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
89 static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
90     fq_t *fq, uint64_t now);
91 static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
92 static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
93     bool purge_all);
94 static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
95 static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq,
96     mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
97     classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
98     u_int32_t *retbytecnt, uint8_t grp_idx);
99 static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
100     cqrq_stat_sc_t *stat, uint64_t now);
101 static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
102 static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
103 static void fq_if_destroy_grps(fq_if_t *fqs);
104 
/*
 * Per-class DRR weight: bounds how many flows are counted when a class
 * queue's byte budget is replenished in the dequeue path
 * (fcl_budget += min(fcl_drr_max, flow count) * fcl_quantum), so
 * higher-priority classes (larger values here) can serve more flows per
 * scheduling round.  Overridable at boot via the "ifcq_drr_max"
 * boot-arg parsed in pktsched_fq_init().
 */
uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
	[FQ_IF_CTL_INDEX]       = 8,
	[FQ_IF_VO_INDEX]        = 8,
	[FQ_IF_VI_INDEX]        = 6,
	[FQ_IF_RV_INDEX]        = 6,
	[FQ_IF_AV_INDEX]        = 6,
	[FQ_IF_OAM_INDEX]       = 4,
	[FQ_IF_RD_INDEX]        = 4,
	[FQ_IF_BE_INDEX]        = 4,
	[FQ_IF_BK_INDEX]        = 2,
	[FQ_IF_BK_SYS_INDEX]    = 2,
};
117 
118 #define FQ_CODEL_DRR_MAX(_s)    fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]
119 
120 static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
121     fq_if_state state);
122 static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
123     fq_if_state dst_state, fq_if_state src_state);
124 static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
125     fq_if_state state);
126 static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
127     fq_if_state state, fq_if_group_t **selected_grp);
128 static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
129     fq_if_state dst_state, fq_if_state src_state);
130 
131 static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
132     fq_if_state state);
133 static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
134     fq_if_state dst_state, fq_if_state src_state);
135 static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
136     fq_if_state state);
137 static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
138     fq_if_state state, fq_if_group_t **selected_grp);
139 static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
140     fq_if_state dst_state, fq_if_state src_state);
141 
/*
 * State-bitmap operations used when dequeueing without a service-class
 * restriction: each op works on the whole per-group bitmap and ignores
 * its 'pri' argument.
 */
bitmap_ops_t fq_if_grps_bitmap_ops =
{
	.ffs    = fq_if_grps_bitmap_ffs,
	.zeros  = fq_if_grps_bitmap_zeros,
	.cpy    = fq_if_grps_bitmap_cpy,
	.clr    = fq_if_grps_bitmap_clr,
	.move   = fq_if_grps_bitmap_move,
};
150 
/*
 * State-bitmap operations used when dequeueing a single service class:
 * each op touches only the single bit selected by 'pri' in every
 * group's bitmap.
 */
bitmap_ops_t fq_if_grps_sc_bitmap_ops =
{
	.ffs    = fq_if_grps_sc_bitmap_ffs,
	.zeros  = fq_if_grps_sc_bitmap_zeros,
	.cpy    = fq_if_grps_sc_bitmap_cpy,
	.clr    = fq_if_grps_sc_bitmap_clr,
	.move   = fq_if_grps_sc_bitmap_move,
};
159 
160 void
pktsched_fq_init(void)161 pktsched_fq_init(void)
162 {
163 	PE_parse_boot_argn("ifclassq_enable_pacing", &ifclassq_enable_pacing,
164 	    sizeof(ifclassq_enable_pacing));
165 
166 	// format looks like ifcq_drr_max=8,8,6
167 	char buf[(FQ_IF_MAX_CLASSES) * 3];
168 	size_t i, len, pri_index = 0;
169 	uint32_t drr = 0;
170 	if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
171 		return;
172 	}
173 
174 	len = strlen(buf);
175 	for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
176 		if (buf[i] != ',' && buf[i] != '\0') {
177 			VERIFY(buf[i] >= '0' && buf[i] <= '9');
178 			drr = drr * 10 + buf[i] - '0';
179 			continue;
180 		}
181 		fq_codel_drr_max_values[pri_index] = drr;
182 		pri_index += 1;
183 		drr = 0;
184 	}
185 }
186 
187 #define FQ_IF_FLOW_HASH_ID(_flowid_) \
188 	(((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK)
189 
190 #define FQ_IF_CLASSQ_IDLE(_fcl_) \
191 	(STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
192 	STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))
193 
194 typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
195 typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
196     int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
197     uint32_t *, boolean_t *, uint64_t);
198 
/*
 * fq_if_append_pkt_t callback for mbuf queues: link 'next_pkt' after
 * 'pkt' via the mbuf packet-chain pointer.
 */
static void
fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
}
204 
205 static inline uint64_t
fq_codel_get_time(void)206 fq_codel_get_time(void)
207 {
208 	struct timespec ts;
209 	uint64_t now;
210 
211 	nanouptime(&ts);
212 	now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
213 	return now;
214 }
215 
216 #if SKYWALK
/*
 * fq_if_append_pkt_t callback for Skywalk kernel-packet queues: link
 * 'next_pkt' after 'pkt' via the kernel-packet chain pointer.
 */
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
222 #endif /* SKYWALK */
223 
224 #if SKYWALK
/*
 * Dequeue Skywalk kernel packets from a single flow queue.
 *
 * Packets are pulled while the flow still has DRR deficit, the caller's
 * pkt/byte limits are not hit, packets remain queued, and the pacer
 * (fq_tx_time_ready) says the head packet is eligible to transmit.
 * Dequeued packets are appended to the caller's head/tail chain and the
 * class-queue dequeue stats and the caller's counters are updated.
 *
 * Returns TRUE when a caller limit was reached; *qempty reports whether
 * the flow queue drained.
 */
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	uint32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_PACKET);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		/*
		 * Tag the first packet of a fresh flow with PKT_F_NEW_FLOW;
		 * PKT_F_NEW_FLOW must be declared in PKT_F_COMMON_MASK along
		 * with all other common pflags.
		 */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* append to the caller's chain, keeping tail terminated */
		if (head->cp_kpkt == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_kpkt != NULL);
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_kpkt->pkt_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
	return limit_reached;
}
282 #endif /* SKYWALK */
283 
/*
 * Dequeue mbufs from a single flow queue; mbuf twin of
 * fq_getq_flow_kpkt().
 *
 * Packets are pulled while the flow still has DRR deficit, the caller's
 * pkt/byte limits are not hit, packets remain queued, and the pacer
 * (fq_tx_time_ready) says the head packet is eligible to transmit.
 * Dequeued packets are appended to the caller's head/tail chain and the
 * class-queue dequeue stats and the caller's counters are updated.
 *
 * Returns TRUE when a caller limit was reached; *qempty reports whether
 * the flow queue drained.
 */
static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	u_int32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_MBUF);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;

		/* tag the first packet of a fresh flow with PKTF_NEW_FLOW */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* append to the caller's chain, keeping tail terminated */
		if (head->cp_mbuf == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_mbuf != NULL);
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_mbuf->m_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
	return limit_reached;
}
338 
/*
 * Pacemaker thread-call handler: arg0 carries the ifnet registered in
 * fq_if_alloc().  Kicks the driver's start path, bypassing any start
 * delay, so paced packets whose tx time has arrived get transmitted.
 */
static void
fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct ifnet* ifp = (struct ifnet*)arg0;
	ASSERT(ifp != NULL);

	ifnet_start_ignore_delay(ifp);
}
348 
/*
 * Allocate and initialize the FQ-CoDel scheduler state for 'ifq'.
 * The returned fq_if_t is zero-filled, remembers the packet type it
 * will carry, and owns a one-shot pacemaker thread call.  Freed by
 * fq_if_destroy().
 */
fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
	fq_if_t *fqs;

	ASSERT(ifq->ifcq_ifp != NULL);
	fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
	fqs->fqs_ifq = ifq;
	fqs->fqs_ptype = ptype;

	/* Configure packet drop limit across all queues */
	fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
	STAILQ_INIT(&fqs->fqs_fclist);
	TAILQ_INIT(&fqs->fqs_empty_list);
	TAILQ_INIT(&fqs->fqs_combined_grp_list);
	/* one-shot doorbell used by pacing; see fq_if_schedule_pacemaker() */
	fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall,
	    (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);

	return fqs;
}
371 
/*
 * Tear down and free scheduler state allocated by fq_if_alloc():
 * cancel and free the pacemaker thread call, purge all queued packets,
 * destroy the groups, then release the fq_if_t itself.  Called with the
 * IFCQ lock held; the lock is converted to exclusive mode here.
 */
void
fq_if_destroy(fq_if_t *fqs)
{
	struct ifnet    *ifp = fqs->fqs_ifq->ifcq_ifp;
	thread_call_t   tcall = fqs->fqs_pacemaker_tcall;

	VERIFY(ifp != NULL);
	ASSERT(tcall != NULL);
	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	/*
	 * NOTE: the driver's start lock must not be held here —
	 * thread_call_cancel_wait() below may block.
	 */
	LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

	/*
	 * Since we are holding the IFCQ lock here, another thread cannot enter AQM
	 * and schedule a pacemaker call. So we do not need a sleep wait loop here;
	 * cancel-wait and free should succeed in one call.
	 */
	thread_call_cancel_wait(tcall);
	ASSERT(thread_call_free(tcall));

	fq_if_purge(fqs);
	fq_if_destroy_grps(fqs);

	fqs->fqs_ifq = NULL;
	zfree(fq_if_zone, fqs);
}
398 
399 static inline uint8_t
fq_if_service_to_priority(fq_if_t * fqs,mbuf_svc_class_t svc)400 fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
401 {
402 	uint8_t pri;
403 
404 	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
405 		switch (svc) {
406 		case MBUF_SC_BK_SYS:
407 		case MBUF_SC_BK:
408 			pri = FQ_IF_BK_INDEX;
409 			break;
410 		case MBUF_SC_BE:
411 		case MBUF_SC_RD:
412 		case MBUF_SC_OAM:
413 			pri = FQ_IF_BE_INDEX;
414 			break;
415 		case MBUF_SC_AV:
416 		case MBUF_SC_RV:
417 		case MBUF_SC_VI:
418 		case MBUF_SC_SIG:
419 			pri = FQ_IF_VI_INDEX;
420 			break;
421 		case MBUF_SC_VO:
422 		case MBUF_SC_CTL:
423 			pri = FQ_IF_VO_INDEX;
424 			break;
425 		default:
426 			pri = FQ_IF_BE_INDEX; /* Use best effort by default */
427 			break;
428 		}
429 		return pri;
430 	}
431 
432 	/* scheduler is not managed by the driver */
433 	switch (svc) {
434 	case MBUF_SC_BK_SYS:
435 		pri = FQ_IF_BK_SYS_INDEX;
436 		break;
437 	case MBUF_SC_BK:
438 		pri = FQ_IF_BK_INDEX;
439 		break;
440 	case MBUF_SC_BE:
441 		pri = FQ_IF_BE_INDEX;
442 		break;
443 	case MBUF_SC_RD:
444 		pri = FQ_IF_RD_INDEX;
445 		break;
446 	case MBUF_SC_OAM:
447 		pri = FQ_IF_OAM_INDEX;
448 		break;
449 	case MBUF_SC_AV:
450 		pri = FQ_IF_AV_INDEX;
451 		break;
452 	case MBUF_SC_RV:
453 		pri = FQ_IF_RV_INDEX;
454 		break;
455 	case MBUF_SC_VI:
456 		pri = FQ_IF_VI_INDEX;
457 		break;
458 	case MBUF_SC_SIG:
459 		pri = FQ_IF_SIG_INDEX;
460 		break;
461 	case MBUF_SC_VO:
462 		pri = FQ_IF_VO_INDEX;
463 		break;
464 	case MBUF_SC_CTL:
465 		pri = FQ_IF_CTL_INDEX;
466 		break;
467 	default:
468 		pri = FQ_IF_BE_INDEX; /* Use best effort by default */
469 		break;
470 	}
471 	return pri;
472 }
473 
474 void
fq_if_classq_init(fq_if_group_t * fqg,uint32_t pri,uint32_t quantum,uint32_t drr_max,uint32_t svc_class)475 fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
476     uint32_t drr_max, uint32_t svc_class)
477 {
478 	fq_if_classq_t *fq_cl;
479 	VERIFY(pri < FQ_IF_MAX_CLASSES);
480 	fq_cl = &fqg->fqg_classq[pri];
481 
482 	VERIFY(fq_cl->fcl_quantum == 0);
483 	VERIFY(quantum != 0);
484 	fq_cl->fcl_quantum = quantum;
485 	fq_cl->fcl_pri = pri;
486 	fq_cl->fcl_drr_max = drr_max;
487 	fq_cl->fcl_service_class = svc_class;
488 	fq_cl->fcl_next_tx_time = 0;
489 	fq_cl->fcl_flags = 0;
490 	STAILQ_INIT(&fq_cl->fcl_new_flows);
491 	STAILQ_INIT(&fq_cl->fcl_old_flows);
492 }
493 
/*
 * Enqueue a packet chain into the scheduler.
 *
 * The chain's service class selects the class queue within the group
 * chosen by the packet's qset index (Skywalk only; mbufs use group 0).
 * Returns 0 on success, EQFULL as an advisory flow-control signal,
 * EQSUSPENDED when the class is throttled/suspended, or ENOBUFS on
 * drop.  *pdrop tells the caller whether the chain was freed here.
 */
int
fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
	uint8_t pri, grp_idx = 0;
	fq_if_t *fqs;
	fq_if_classq_t *fq_cl;
	fq_if_group_t *fq_group;
	int ret;
	mbuf_svc_class_t svc;
	pktsched_pkt_t pkt;

	pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
	if (head->cp_ptype == QP_PACKET) {
		grp_idx = head->cp_kpkt->pkt_qset_idx;
	}
#endif /* SKYWALK */
	pri = fq_if_service_to_priority(fqs, svc);
	VERIFY(pri < FQ_IF_MAX_CLASSES);

	IFCQ_LOCK_SPIN(ifq);
	fq_group = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_group->fqg_classq[pri];

	if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
		IFCQ_UNLOCK(ifq);
		/* BK_SYS is currently throttled */
		os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed);
		pktsched_free_pkt(&pkt);
		*pdrop = TRUE;
		ret = EQSUSPENDED;
		goto done;
	}

	ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
	ret = fq_addq(fqs, fq_group, &pkt, fq_cl);
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
		    (1 << pri)) == 0) {
			/*
			 * this group is not in ER or EB groups,
			 * mark it as IB
			 */
			pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
		}
	}

	/* map fq_addq()'s classq result codes to errno-style returns */
	if (__improbable(ret != 0)) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_COMPRESSED) {
			ret = 0;
			*pdrop = FALSE;
		} else {
			/* packet was not enqueued: unlock, free it, and report */
			IFCQ_UNLOCK(ifq);
			*pdrop = TRUE;
			pktsched_free_pkt(&pkt);
			switch (ret) {
			case CLASSQEQ_DROP:
				ret = ENOBUFS;
				goto done;
			case CLASSQEQ_DROP_FC:
				ret = EQFULL;
				goto done;
			case CLASSQEQ_DROP_SP:
				ret = EQSUSPENDED;
				goto done;
			default:
				VERIFY(0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* NOTREACHED */
			__builtin_unreachable();
		}
	} else {
		*pdrop = FALSE;
	}
	/* packet(s) accepted: account at ifclassq and group level */
	IFCQ_ADD_LEN(ifq, cnt);
	IFCQ_INC_BYTES(ifq, bytes);


	FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
	FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

	IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
	if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
		ret = 0;
	}
#endif /* DEBUG || DEVELOPMENT */
	return ret;
}
594 
/*
 * Dequeue a single packet (any service class) for group 'grp_idx';
 * thin wrapper around fq_if_dequeue_classq_multi() with a 1-packet
 * limit.
 */
void
fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx)
{
	(void) fq_if_dequeue_classq_multi(ifq, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}
601 
/*
 * Dequeue a single packet of service class 'svc' for group 'grp_idx';
 * thin wrapper around fq_if_dequeue_sc_classq_multi() with a 1-packet
 * limit.
 */
void
fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
    classq_pkt_t *pkt, uint8_t grp_idx)
{
	(void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}
609 
610 static inline void
fq_dqlist_add(flowq_dqlist_t * fq_dqlist_head,fq_t * fq)611 fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
612 {
613 	ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
614 	ASSERT(!fq->fq_in_dqlist);
615 	STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
616 	fq->fq_in_dqlist = true;
617 }
618 
/*
 * Detach a flow from the dequeue list, splicing its accumulated
 * per-flow packet chain (fq_dq_head..fq_dq_tail) onto the caller's
 * head/tail chain, then reset the flow's chain and list-membership
 * state.
 */
static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	ASSERT(fq->fq_in_dqlist);
	/* nothing accumulated for this flow: just unlink it */
	if (fq->fq_dq_head.cp_mbuf == NULL) {
		goto done;
	}

	if (head->cp_mbuf == NULL) {
		*head = fq->fq_dq_head;
	} else {
		ASSERT(tail->cp_mbuf != NULL);

		/* link the flow's chain after the caller's current tail */
		switch (ptype) {
		case QP_MBUF:
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
			ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
			break;
#if SKYWALK
		case QP_PACKET:
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
			ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
			break;
#endif /* SKYWALK */
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}
	*tail = fq->fq_dq_tail;
done:
	STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;
}
659 
660 static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t * fq_dqlist_head,classq_pkt_t * head,classq_pkt_t * tail,classq_pkt_type_t ptype)661 fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
662     classq_pkt_t *tail, classq_pkt_type_t ptype)
663 {
664 	fq_t *fq, *tfq;
665 
666 	STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
667 		fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
668 	}
669 }
670 
671 static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)672 fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
673     fq_if_group_t **selected_grp)
674 {
675 	#pragma unused(pri)
676 
677 	fq_if_group_t *grp;
678 	uint32_t highest_pri = FQ_IF_MAX_CLASSES;
679 	int ret_pri = 0;
680 
681 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
682 		uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
683 		/* bitmap is empty in this case */
684 		if (cur_pri == 0) {
685 			continue;
686 		}
687 		if (cur_pri <= highest_pri) {
688 			highest_pri = cur_pri;
689 			ret_pri = cur_pri;
690 			*selected_grp = grp;
691 		}
692 	}
693 	return ret_pri;
694 }
695 
696 static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)697 fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
698 {
699     #pragma unused(pri)
700 
701 	fq_if_group_t *grp;
702 
703 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
704 		if (grp->fqg_bitmaps[state] != 0) {
705 			return FALSE;
706 		}
707 	}
708 	return TRUE;
709 }
710 
711 static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)712 fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
713     fq_if_state src_state)
714 {
715     #pragma unused(pri)
716 
717 	fq_if_group_t *grp;
718 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
719 		grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
720 	}
721 }
722 
723 static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)724 fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
725 {
726     #pragma unused(pri)
727 
728 	fq_if_group_t *grp;
729 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
730 		grp->fqg_bitmaps[state] = 0;
731 	}
732 }
733 
734 static void
fq_if_grps_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)735 fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
736     fq_if_state src_state)
737 {
738     #pragma unused(pri)
739 
740 	fq_if_group_t *grp;
741 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
742 		grp->fqg_bitmaps[dst_state] =
743 		    grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state];
744 		grp->fqg_bitmaps[src_state] = 0;
745 	}
746 }
747 
748 static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)749 fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
750     fq_if_group_t **selected_grp)
751 {
752 	fq_if_group_t *grp;
753 	int ret_pri = 0;
754 
755 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
756 		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
757 			/* +1 to match the semantics of pktsched_ffs */
758 			ret_pri = pri + 1;
759 			*selected_grp = grp;
760 			break;
761 		}
762 	}
763 
764 	return ret_pri;
765 }
766 
767 static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)768 fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
769 {
770 	fq_if_group_t *grp;
771 
772 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
773 		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
774 			return FALSE;
775 		}
776 	}
777 	return TRUE;
778 }
779 
780 static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)781 fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
782     fq_if_state src_state)
783 {
784 	fq_if_group_t *grp;
785 
786 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
787 		pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
788 		    &grp->fqg_bitmaps[src_state]);
789 	}
790 }
791 
792 static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)793 fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
794 {
795 	fq_if_group_t *grp;
796 
797 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
798 		pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
799 	}
800 }
801 
802 static void
fq_if_grps_sc_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)803 fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
804     fq_if_state src_state)
805 {
806 	fq_if_group_t *grp;
807 
808 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
809 		pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state],
810 		    &grp->fqg_bitmaps[src_state]);
811 		pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]);
812 	}
813 }
814 
815 /*
816  * Pacemaker is only scheduled when no packet can be dequeued from AQM
817  * due to pacing. Pacemaker will doorbell the driver when current >= next_tx_time.
818  * This only applies to L4S traffic at this moment.
819  */
/*
 * Arm the one-shot pacemaker thread call to fire once 'next_tx_time'
 * arrives, so the driver gets a doorbell for paced packets.  No-op
 * unless both pacing and L4S are enabled.  Caller guarantees
 * now < next_tx_time and a valid timestamp.
 */
static void
fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
{
	uint64_t deadline = 0;
	/* pacing only applies when both pacing and L4S are enabled */
	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
		return;
	}
	ASSERT(next_tx_time != FQ_INVALID_TX_TS);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);
	ASSERT(now < next_tx_time);

	DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, fqs->fqs_ifq->ifcq_ifp,
	    uint64_t, next_tx_time - now);
	KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index, now,
	    next_tx_time, next_tx_time - now);

	/* relative delay (ns, scale 1) converted to an absolute deadline */
	clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1, &deadline);
	thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
}
839 
840 static int
fq_if_dequeue_classq_multi_common(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)841 fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
842     u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
843     classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
844     uint8_t grp_idx)
845 {
846 	uint32_t total_pktcnt = 0, total_bytecnt = 0;
847 	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
848 	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
849 	classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
850 	fq_if_append_pkt_t append_pkt;
851 	flowq_dqlist_t fq_dqlist_head;
852 	fq_if_classq_t *fq_cl;
853 	fq_grp_tailq_t *grp_list, tmp_grp_list;
854 	fq_if_group_t *fq_grp = NULL;
855 	fq_if_t *fqs;
856 	uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
857 	int pri = 0, svc_pri = 0;
858 	bool all_paced = true;
859 
860 	IFCQ_LOCK_ASSERT_HELD(ifq);
861 
862 	fqs = (fq_if_t *)ifq->ifcq_disc;
863 	STAILQ_INIT(&fq_dqlist_head);
864 
865 	switch (fqs->fqs_ptype) {
866 	case QP_MBUF:
867 		append_pkt = fq_if_append_mbuf;
868 		break;
869 
870 #if SKYWALK
871 	case QP_PACKET:
872 		append_pkt = fq_if_append_pkt;
873 		break;
874 #endif /* SKYWALK */
875 
876 	default:
877 		VERIFY(0);
878 		/* NOTREACHED */
879 		__builtin_unreachable();
880 	}
881 
882 	now = fq_codel_get_time();
883 	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
884 		svc_pri = fq_if_service_to_priority(fqs, svc);
885 	} else {
886 		VERIFY(svc == MBUF_SC_UNSPEC);
887 	}
888 
889 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
890 		grp_list = &fqs->fqs_combined_grp_list;
891 		VERIFY(!TAILQ_EMPTY(grp_list));
892 	} else {
893 		grp_list = &tmp_grp_list;
894 		fq_grp = fq_if_find_grp(fqs, grp_idx);
895 		TAILQ_INIT(grp_list);
896 		TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
897 	}
898 
899 	for (;;) {
900 		uint32_t pktcnt = 0, bytecnt = 0;
901 		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
902 		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
903 		bool fq_cl_all_paced = false;
904 		uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;
905 
906 		if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
907 		    fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
908 			fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
909 			fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
910 			if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
911 				if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
912 					/*
913 					 * Move fq_cl in IR back to ER, so that they will inspected with priority
914 					 * the next time the driver dequeues
915 					 */
916 					fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
917 					fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IR);
918 				}
919 				break;
920 			}
921 		}
922 		pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
923 		if (pri == 0) {
924 			/*
925 			 * There are no ER flows, move the highest
926 			 * priority one from EB if there are any in that
927 			 * category
928 			 */
929 			pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
930 			VERIFY(pri > 0);
931 			VERIFY(fq_grp != NULL);
932 			pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
933 			pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
934 		}
935 		VERIFY(fq_grp != NULL);
936 		pri--; /* index starts at 0 */
937 		fq_cl = &fq_grp->fqg_classq[pri];
938 
939 		if (fq_cl->fcl_budget <= 0) {
940 			/* Update the budget */
941 			fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
942 			    fq_cl->fcl_stat.fcl_flows_cnt) *
943 			    fq_cl->fcl_quantum);
944 			if (fq_cl->fcl_budget <= 0) {
945 				goto state_change;
946 			}
947 		}
948 		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
949 		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
950 		    &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
951 		    &fq_cl_next_tx_time);
952 		if (head.cp_mbuf != NULL) {
953 			ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
954 			if (first.cp_mbuf == NULL) {
955 				first = head;
956 			} else {
957 				ASSERT(last.cp_mbuf != NULL);
958 				append_pkt(&last, &head);
959 			}
960 			last = tail;
961 			append_pkt(&last, &tmp);
962 		}
963 		if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
964 			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
965 			next_tx_time = fq_cl_next_tx_time;
966 		}
967 		fq_cl->fcl_budget -= bytecnt;
968 		total_pktcnt += pktcnt;
969 		total_bytecnt += bytecnt;
970 
971 		/*
972 		 * If the class has exceeded the budget but still has data
973 		 * to send, move it to IB
974 		 */
975 state_change:
976 		VERIFY(fq_grp != NULL);
977 		all_paced &= fq_cl_all_paced;
978 		if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
979 			if (fq_cl->fcl_budget <= 0) {
980 				pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
981 				pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
982 			} else if (fq_cl_all_paced) {
983 				if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
984 					/*
985 					 * If a fq_cl still has budget but only paced queues, park it
986 					 * to IR so that we will not keep loopping over it
987 					 */
988 					pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IR]);
989 					pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
990 				}
991 			}
992 		} else {
993 			pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
994 			VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
995 			    fq_grp->fqg_bitmaps[FQ_IF_EB] |
996 			    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
997 			fq_cl->fcl_budget = 0;
998 		}
999 		if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
1000 			if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
1001 				/*
1002 				 * Move fq_cl in IR back to ER, so that they will inspected with priority
1003 				 * the next time the driver dequeues
1004 				 */
1005 				fqs->grp_bitmaps_move(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
1006 			}
1007 			break;
1008 		}
1009 	}
1010 
1011 	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
1012 		TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
1013 		VERIFY(TAILQ_EMPTY(grp_list));
1014 	}
1015 
1016 	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
1017 	    fqs->fqs_ptype);
1018 
1019 	if (__probable(first_packet != NULL)) {
1020 		*first_packet = first;
1021 	}
1022 	if (last_packet != NULL) {
1023 		*last_packet = last;
1024 	}
1025 	if (retpktcnt != NULL) {
1026 		*retpktcnt = total_pktcnt;
1027 	}
1028 	if (retbytecnt != NULL) {
1029 		*retbytecnt = total_bytecnt;
1030 	}
1031 	if (next_tx_time != FQ_INVALID_TX_TS) {
1032 		ASSERT(next_tx_time > now);
1033 		fq_if_schedule_pacemaker(fqs, now, next_tx_time);
1034 	}
1035 
1036 	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
1037 	fq_if_purge_empty_flow_list(fqs, now, false);
1038 	return 0;
1039 }
1040 
1041 int
fq_if_dequeue_classq_multi(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1042 fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
1043     u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1044     classq_pkt_t *last_packet, u_int32_t *retpktcnt,
1045     u_int32_t *retbytecnt, uint8_t grp_idx)
1046 {
1047 	return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
1048 	           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1049 }
1050 
1051 int
fq_if_dequeue_sc_classq_multi(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1052 fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
1053     u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1054     classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1055     uint8_t grp_idx)
1056 {
1057 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1058 
1059 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
1060 		return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt, maxbytecnt,
1061 		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1062 	} else {
1063 		/*
1064 		 * take a shortcut here since there is no need to schedule
1065 		 * one single service class.
1066 		 */
1067 		return fq_if_dequeue_sc_classq_multi_separate(ifq, svc, maxpktcnt, maxbytecnt,
1068 		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1069 	}
1070 }
1071 
1072 static int
fq_if_dequeue_sc_classq_multi_separate(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1073 fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
1074     u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1075     classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1076     uint8_t grp_idx)
1077 {
1078 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1079 	uint8_t pri;
1080 	u_int32_t total_pktcnt = 0, total_bytecnt = 0;
1081 	fq_if_classq_t *fq_cl;
1082 	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
1083 	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
1084 	fq_if_append_pkt_t append_pkt;
1085 	flowq_dqlist_t fq_dqlist_head;
1086 	fq_if_group_t *fq_grp;
1087 	uint64_t now;
1088 
1089 	switch (fqs->fqs_ptype) {
1090 	case QP_MBUF:
1091 		append_pkt = fq_if_append_mbuf;
1092 		break;
1093 
1094 #if SKYWALK
1095 	case QP_PACKET:
1096 		append_pkt = fq_if_append_pkt;
1097 		break;
1098 #endif /* SKYWALK */
1099 
1100 	default:
1101 		VERIFY(0);
1102 		/* NOTREACHED */
1103 		__builtin_unreachable();
1104 	}
1105 
1106 	STAILQ_INIT(&fq_dqlist_head);
1107 	now = fq_codel_get_time();
1108 
1109 	pri = fq_if_service_to_priority(fqs, svc);
1110 	fq_grp = fq_if_find_grp(fqs, grp_idx);
1111 	fq_cl = &fq_grp->fqg_classq[pri];
1112 
1113 	/*
1114 	 * Now we have the queue for a particular service class. We need
1115 	 * to dequeue as many packets as needed, first from the new flows
1116 	 * and then from the old flows.
1117 	 */
1118 	while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
1119 	    fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
1120 		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
1121 		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
1122 		u_int32_t pktcnt = 0, bytecnt = 0;
1123 		bool all_paced = false;
1124 		uint64_t next_tx_time = FQ_INVALID_TX_TS;
1125 
1126 		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
1127 		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
1128 		    &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time);
1129 		if (head.cp_mbuf != NULL) {
1130 			if (first.cp_mbuf == NULL) {
1131 				first = head;
1132 			} else {
1133 				ASSERT(last.cp_mbuf != NULL);
1134 				append_pkt(&last, &head);
1135 			}
1136 			last = tail;
1137 		}
1138 		total_pktcnt += pktcnt;
1139 		total_bytecnt += bytecnt;
1140 
1141 		if (next_tx_time != FQ_INVALID_TX_TS) {
1142 			ASSERT(next_tx_time > now);
1143 			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
1144 			fq_if_schedule_pacemaker(fqs, now, next_tx_time);
1145 			break;
1146 		}
1147 	}
1148 
1149 	/*
1150 	 * Mark classq as IB if it's not idle, so that we can
1151 	 * start without re-init the bitmaps when it's switched
1152 	 * to combined mode.
1153 	 */
1154 	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
1155 		pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
1156 		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
1157 		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
1158 	} else {
1159 		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
1160 		VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
1161 		    fq_grp->fqg_bitmaps[FQ_IF_EB] |
1162 		    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
1163 	}
1164 
1165 	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);
1166 
1167 	if (__probable(first_packet != NULL)) {
1168 		*first_packet = first;
1169 	}
1170 	if (last_packet != NULL) {
1171 		*last_packet = last;
1172 	}
1173 	if (retpktcnt != NULL) {
1174 		*retpktcnt = total_pktcnt;
1175 	}
1176 	if (retbytecnt != NULL) {
1177 		*retbytecnt = total_bytecnt;
1178 	}
1179 
1180 	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
1181 	fq_if_purge_empty_flow_list(fqs, now, false);
1182 	return 0;
1183 }
1184 
/*
 * Drain every packet queued on flow `fq', charge the drops to the
 * interface queue, and walk the flow through its lifecycle states
 * (new/old -> empty -> destroyed).  When pktsp/bytesp are non-NULL they
 * receive the number of packets/bytes purged.  `fq' may be freed by the
 * time this returns (when it ended up on the empty-flow path).
 */
static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
	fq_if_classq_t *fq_cl;
	u_int32_t pkts, bytes;
	pktsched_pkt_t pkt;
	fq_if_group_t *grp;

	fq_cl = &FQ_CLASSQ(fq);
	grp = FQ_GROUP(fq);
	pkts = bytes = 0;
	_PKTSCHED_PKT_INIT(&pkt);
	/* dequeue and free packets until the flow queue is empty */
	for (;;) {
		fq_getq_flow(fqs, fq, &pkt, now);
		if (pkt.pktsched_pkt_mbuf == NULL) {
			VERIFY(pkt.pktsched_ptype == QP_INVALID);
			break;
		}
		pkts++;
		bytes += pktsched_get_pkt_len(&pkt);
		pktsched_free_pkt(&pkt);
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

	IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

	/* move through the flow queue states */
	VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
	if (fq->fq_flags & FQF_NEW_FLOW) {
		fq_if_empty_new_flow(fq, fq_cl);
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		fq_if_empty_old_flow(fqs, fq_cl, fq, now);
	}
	if (fq->fq_flags & FQF_EMPTY_FLOW) {
		/* fq is freed here; must not be touched afterwards */
		fq_if_purge_empty_flow(fqs, fq);
		fq = NULL;
	}

	/* class has nothing left: clear it from all scheduler state bitmaps */
	if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
		int i;
		for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
			pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
		}
	}

	if (pktsp != NULL) {
		*pktsp = pkts;
	}
	if (bytesp != NULL) {
		*bytesp = bytes;
	}
}
1240 
/*
 * Flush all packets from every flow (new- and old-flow lists) of class
 * `fq_cl' and reset its DRR budget.
 */
static void
fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	fq_t *fq, *tfq;
	uint64_t now;

	now = fq_codel_get_time();
	/*
	 * Take each flow from new/old flow list and flush mbufs
	 * in that flow
	 */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
	}
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
	}
	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));

	/* re-initialize the (now empty) lists and restart the budget */
	STAILQ_INIT(&fq_cl->fcl_new_flows);
	STAILQ_INIT(&fq_cl->fcl_old_flows);
	fq_cl->fcl_budget = 0;
}
1265 
1266 static void
fq_if_purge(fq_if_t * fqs)1267 fq_if_purge(fq_if_t *fqs)
1268 {
1269 	uint64_t now;
1270 	fq_if_group_t *grp;
1271 	int i;
1272 
1273 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1274 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1275 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1276 			continue;
1277 		}
1278 
1279 		grp = fq_if_find_grp(fqs, grp_idx);
1280 		fq_if_purge_grp(fqs, grp);
1281 	}
1282 
1283 	now = fq_codel_get_time();
1284 	fq_if_purge_empty_flow_list(fqs, now, true);
1285 
1286 	VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
1287 	VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1288 
1289 	fqs->fqs_large_flow = NULL;
1290 	for (i = 0; i < FQ_IF_HASH_TABLE_SIZE; i++) {
1291 		VERIFY(SLIST_EMPTY(&fqs->fqs_flows[i]));
1292 	}
1293 
1294 	IFCQ_LEN(fqs->fqs_ifq) = 0;
1295 	IFCQ_BYTES(fqs->fqs_ifq) = 0;
1296 }
1297 
/*
 * Purge all packets belonging to the flow identified by req->flow and
 * service class req->sc, across every group.  Accumulates the purged
 * packet/byte totals into req->packets / req->bytes.
 */
static void
fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
{
	fq_t *fq;
	uint64_t now;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	req->packets = req->bytes = 0;
	VERIFY(req->flow != 0);

	now = fq_codel_get_time();

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		uint32_t bytes = 0, pkts = 0;

		grp = fq_if_find_grp(fqs, grp_idx);
		/*
		 * Packet and traffic type are needed only if we want
		 * to create a flow queue.
		 */
		fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C);
		if (fq != NULL) {
			fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
			req->bytes += bytes;
			req->packets += pkts;
		}
	}
}
1330 
1331 static uint16_t
fq_if_calc_quantum(struct ifnet * ifp)1332 fq_if_calc_quantum(struct ifnet *ifp)
1333 {
1334 	uint16_t quantum;
1335 
1336 	switch (ifp->if_family) {
1337 	case IFNET_FAMILY_ETHERNET:
1338 		VERIFY((ifp->if_mtu + ETHER_HDR_LEN) <= UINT16_MAX);
1339 		quantum = (uint16_t)ifp->if_mtu + ETHER_HDR_LEN;
1340 		break;
1341 
1342 	case IFNET_FAMILY_CELLULAR:
1343 	case IFNET_FAMILY_IPSEC:
1344 	case IFNET_FAMILY_UTUN:
1345 		VERIFY(ifp->if_mtu <= UINT16_MAX);
1346 		quantum = (uint16_t)ifp->if_mtu;
1347 		break;
1348 
1349 	default:
1350 		quantum = FQ_CODEL_DEFAULT_QUANTUM;
1351 		break;
1352 	}
1353 
1354 	if ((ifp->if_hwassist & IFNET_TSOF) != 0) {
1355 		VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
1356 		VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
1357 		quantum = (uint16_t)MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
1358 		quantum = (quantum != 0) ? quantum : IF_MAXMTU;
1359 	}
1360 
1361 	quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
1362 #if DEBUG || DEVELOPMENT
1363 	quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
1364 #endif /* DEBUG || DEVELOPMENT */
1365 	VERIFY(quantum != 0);
1366 	return quantum;
1367 }
1368 
/*
 * MTU-change handler: recompute the base quantum from the interface and
 * refresh each class's per-priority quantum in every allocated group.
 * Driver-managed queues expose only four classes; the full set is used
 * otherwise.
 */
static void
fq_if_mtu_update(fq_if_t *fqs)
{
/* expands to: grp->fqg_classq[<class index>].fcl_quantum = FQ_CODEL_QUANTUM_<class>(q) */
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q)                     \
	(_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum =        \
	    FQ_CODEL_QUANTUM_ ## _s(_q)                                 \

	uint32_t quantum;
	fq_if_group_t *grp;

	quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);

		if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
		} else {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
		}
	}
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}
1408 
1409 static void
fq_if_event(fq_if_t * fqs,cqev_t ev)1410 fq_if_event(fq_if_t *fqs, cqev_t ev)
1411 {
1412 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1413 
1414 	switch (ev) {
1415 	case CLASSQ_EV_LINK_UP:
1416 	case CLASSQ_EV_LINK_DOWN:
1417 		fq_if_purge(fqs);
1418 		break;
1419 	case CLASSQ_EV_LINK_MTU:
1420 		fq_if_mtu_update(fqs);
1421 		break;
1422 	default:
1423 		break;
1424 	}
1425 }
1426 
/*
 * Throttle-on handler for one class: drop everything the class has
 * queued and mark the scheduler as throttled (emits a ktrace span
 * start, matched by fq_if_classq_resume()).
 */
static void
fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	fq_if_purge_classq(fqs, fq_cl);
	fqs->fqs_throttle = 1;
	fq_cl->fcl_stat.fcl_throttle_on++;
	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
}
1436 
/*
 * Throttle-off handler for one class: clear the throttle flag (the
 * class must already be idle) and close the ktrace span opened by
 * fq_if_classq_suspend().
 */
static void
fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
	fqs->fqs_throttle = 0;
	fq_cl->fcl_stat.fcl_throttle_off++;
	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
}
1446 
1447 
/*
 * CLASSQRQ_THROTTLE handler.  With tr->set clear this is a query and
 * the current throttle level is returned in tr->level.  Otherwise the
 * new level is applied to the BK_SYS class of every allocated group:
 * OFF resumes the class, OPPORTUNISTIC suspends (purges) it.  Returns
 * EALREADY when the requested level is already in effect.
 */
static int
fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
{
	struct ifclassq *ifq = fqs->fqs_ifq;
	uint8_t index;
	fq_if_group_t *grp;

#if !MACH_ASSERT
#pragma unused(ifq)
#endif
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!tr->set) {
		/* query only */
		tr->level = fqs->fqs_throttle;
		return 0;
	}

	if (tr->level == fqs->fqs_throttle) {
		return EALREADY;
	}

	/* Throttling is allowed on BK_SYS class only */
	index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		grp = fq_if_find_grp(fqs, grp_idx);
		switch (tr->level) {
		case IFNET_THROTTLE_OFF:
			fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
			break;
		case IFNET_THROTTLE_OPPORTUNISTIC:
			fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
			break;
		default:
			break;
		}
	}
	return 0;
}
1490 
1491 static inline boolean_t
fq_if_is_fq_cl_paced(fq_if_classq_t * fq_cl,uint64_t now)1492 fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now)
1493 {
1494 	if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) {
1495 		return true;
1496 	}
1497 
1498 	fq_cl->fcl_flags &= ~FCL_PACED;
1499 	fq_cl->fcl_next_tx_time = 0;
1500 	return false;
1501 }
1502 
/*
 * Report the queued packet/byte counts for one service class of `grp'
 * into `stat'.  When L4S pacing is enabled and the entire class is
 * currently paced, report zero so callers do not schedule a transmit
 * for traffic that must wait.
 */
static void
fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now)
{
	uint8_t pri;
	fq_if_classq_t *fq_cl;

	ASSERT(stat != NULL);
	pri = fq_if_service_to_priority(fqs, stat->sc);

	fq_cl = &grp->fqg_classq[pri];
	stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
	stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;

	if (ifclassq_enable_pacing && ifclassq_enable_l4s &&
	    fq_if_is_fq_cl_paced(fq_cl, now)) {
		stat->packets = 0;
		stat->bytes = 0;
	}
}
1522 
1523 static boolean_t
fq_if_is_grp_all_paced(fq_if_group_t * grp)1524 fq_if_is_grp_all_paced(fq_if_group_t *grp)
1525 {
1526 	fq_if_classq_t *fq_cl;
1527 	uint64_t now;
1528 
1529 	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
1530 		return false;
1531 	}
1532 
1533 	now = fq_codel_get_time();
1534 	for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) {
1535 		fq_cl = &grp->fqg_classq[fq_cl_idx];
1536 		if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) {
1537 			continue;
1538 		}
1539 		if (!fq_if_is_fq_cl_paced(fq_cl, now)) {
1540 			return false;
1541 		}
1542 	}
1543 
1544 	return true;
1545 }
1546 
1547 boolean_t
fq_if_is_all_paced(struct ifclassq * ifq)1548 fq_if_is_all_paced(struct ifclassq *ifq)
1549 {
1550 	fq_if_group_t *grp;
1551 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1552 
1553 	IFCQ_LOCK_ASSERT_HELD(ifq);
1554 
1555 	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
1556 		return false;
1557 	}
1558 
1559 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1560 		grp = fqs->fqs_classq_groups[grp_idx];
1561 		if (grp == NULL || FQG_BYTES(grp) == 0) {
1562 			continue;
1563 		}
1564 
1565 		if (!fq_if_is_grp_all_paced(grp)) {
1566 			return false;
1567 		}
1568 	}
1569 
1570 	return true;
1571 }
1572 
/*
 * CLASSQRQ_STAT_SC handler: report queued packet/byte counts for the
 * service class and group selected by `stat'.  stat->grp_idx may be
 * IF_CLASSQ_ALL_GRPS and stat->sc may be MBUF_SC_UNSPEC to aggregate
 * over all groups and/or all classes.  Fully-paced classes/groups
 * (L4S pacing) report zero.
 */
void
fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
{
	cqrq_stat_sc_t grp_sc_stat;
	fq_if_group_t *grp;
	uint64_t now = fq_codel_get_time();

	if (stat == NULL) {
		return;
	}
	grp_sc_stat.sc = stat->sc;
	stat->packets = 0;
	stat->bytes = 0;

	if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
		if (stat->sc == MBUF_SC_UNSPEC) {
			/* whole interface: use the aggregate ifclassq counters */
			if (!fq_if_is_all_paced(fqs->fqs_ifq)) {
				stat->packets = IFCQ_LEN(fqs->fqs_ifq);
				stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
			}
		} else {
			/* one service class summed over every group */
			for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				grp = fqs->fqs_classq_groups[grp_idx];
				if (grp == NULL) {
					continue;
				}

				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		}
		return;
	}

	if (stat->sc == MBUF_SC_UNSPEC) {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			/* combined scheduling: sum all combined groups */
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				stat->packets += FQG_LEN(grp);
				stat->bytes += FQG_BYTES(grp);
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			if (!fq_if_is_grp_all_paced(grp)) {
				stat->packets = FQG_LEN(grp);
				stat->bytes = FQG_BYTES(grp);
			}
		}
	} else {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			/* one class across all combined groups */
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		} else {
			/* single group, single class */
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			fq_if_grp_stat_sc(fqs, grp, stat, now);
		}
	}
}
1640 
1641 int
fq_if_request_classq(struct ifclassq * ifq,cqrq_t rq,void * arg)1642 fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg)
1643 {
1644 	int err = 0;
1645 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1646 
1647 	IFCQ_LOCK_ASSERT_HELD(ifq);
1648 
1649 	/*
1650 	 * These are usually slow operations, convert the lock ahead of time
1651 	 */
1652 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1653 	switch (rq) {
1654 	case CLASSQRQ_PURGE:
1655 		fq_if_purge(fqs);
1656 		break;
1657 	case CLASSQRQ_PURGE_SC:
1658 		fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
1659 		break;
1660 	case CLASSQRQ_EVENT:
1661 		fq_if_event(fqs, (cqev_t)arg);
1662 		break;
1663 	case CLASSQRQ_THROTTLE:
1664 		fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
1665 		break;
1666 	case CLASSQRQ_STAT_SC:
1667 		fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
1668 		break;
1669 	}
1670 	return err;
1671 }
1672 
/*
 * Attach an FQ-CoDel scheduler to `ifq': allocate the fq_if state,
 * select the bitmap ops (per-service-class ops for driver-managed
 * queues), attach the discipline, and create the initial default
 * group.  Returns 0 or an errno.
 */
int
fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	fq_if_t *fqs = NULL;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

	fqs = fq_if_alloc(ifq, ptype);
	if (fqs == NULL) {
		return ENOMEM;
	}
	if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
		fqs->fqs_flags |= FQS_DRIVER_MANAGED;
		fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
	} else {
		fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
	}

	err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
		    "failed to attach fq_if: %d\n", __func__, err);
		fq_if_destroy(fqs);
		return err;
	}

	/*
	 * Always create one group. If qset 0 is added later,
	 * this group will be updated.
	 */
	err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
		    "failed to create a fq group: %d\n", __func__, err);
		/*
		 * NOTE(review): fqs was already attached above; presumably
		 * fq_if_destroy() also detaches it from ifq -- confirm,
		 * otherwise ifcq_disc would be left dangling here.
		 */
		fq_if_destroy(fqs);
	}

	return err;
}
1716 
1717 fq_t *
fq_if_hash_pkt(fq_if_t * fqs,fq_if_group_t * fq_grp,u_int32_t flowid,mbuf_svc_class_t svc_class,u_int64_t now,bool create,fq_tfc_type_t tfc_type)1718 fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, u_int32_t flowid,
1719     mbuf_svc_class_t svc_class, u_int64_t now, bool create,
1720     fq_tfc_type_t tfc_type)
1721 {
1722 	fq_t *fq = NULL;
1723 	flowq_list_t *fq_list;
1724 	fq_if_classq_t *fq_cl;
1725 	u_int8_t fqs_hash_id;
1726 	u_int8_t scidx;
1727 
1728 	scidx = fq_if_service_to_priority(fqs, svc_class);
1729 
1730 	fqs_hash_id = FQ_IF_FLOW_HASH_ID(flowid);
1731 
1732 	fq_list = &fqs->fqs_flows[fqs_hash_id];
1733 
1734 	SLIST_FOREACH(fq, fq_list, fq_hashlink) {
1735 		if (fq->fq_flowhash == flowid &&
1736 		    fq->fq_sc_index == scidx &&
1737 		    fq->fq_tfc_type == tfc_type &&
1738 		    fq->fq_group == fq_grp) {
1739 			break;
1740 		}
1741 	}
1742 	if (fq == NULL && create) {
1743 		/* If the flow is not already on the list, allocate it */
1744 		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1745 		fq = fq_alloc(fqs->fqs_ptype);
1746 		if (fq != NULL) {
1747 			fq->fq_flowhash = flowid;
1748 			fq->fq_sc_index = scidx;
1749 			fq->fq_group = fq_grp;
1750 			fq->fq_tfc_type = tfc_type;
1751 			fq_cl = &FQ_CLASSQ(fq);
1752 			fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
1753 			fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
1754 			fq->fq_next_tx_time = FQ_INVALID_TX_TS;
1755 			SLIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
1756 			fq_cl->fcl_stat.fcl_flows_cnt++;
1757 		}
1758 		KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
1759 		    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
1760 		    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
1761 	} else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
1762 		fq_if_reuse_empty_flow(fqs, fq, now);
1763 	}
1764 
1765 	/*
1766 	 * If getq time is not set because this is the first packet or after
1767 	 * idle time, set it now so that we can detect a stall.
1768 	 */
1769 	if (fq != NULL && fq->fq_getqtime == 0) {
1770 		fq->fq_getqtime = now;
1771 	}
1772 
1773 	return fq;
1774 }
1775 
/*
 * Unlink `fq' from its hash bucket, release any pending flow-control
 * advisory on it, and free it.  Caller guarantees the flow is not on
 * the empty-flow list (asserted below).
 */
void
fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
{
	u_int8_t hash_id;

	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
	hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash);
	SLIST_REMOVE(&fqs->fqs_flows[hash_id], fq, flowq,
	    fq_hashlink);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* wake up any stack layer blocked on this flow's advisory */
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
	    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
	fq_destroy(fq, fqs->fqs_ptype);
}
1794 
1795 inline boolean_t
fq_if_at_drop_limit(fq_if_t * fqs)1796 fq_if_at_drop_limit(fq_if_t *fqs)
1797 {
1798 	return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
1799 	       TRUE : FALSE;
1800 }
1801 
1802 inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t * fqs)1803 fq_if_almost_at_drop_limit(fq_if_t *fqs)
1804 {
1805 	/*
1806 	 * Whether we are above 90% of the queue limit. This is used to tell if we
1807 	 * can stop flow controlling the largest flow.
1808 	 */
1809 	return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
1810 }
1811 
/*
 * Resurrect a flow parked on the empty-flow list: take it off that
 * list, clear its state flags and timers, and count it as an active
 * flow of its class again.
 */
static inline void
fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
{
	ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
	STAILQ_NEXT(fq, fq_actlink) = NULL;
	fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
	fq->fq_empty_purge_time = 0;
	fq->fq_getqtime = 0;
	fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
	fqs->fqs_empty_list_cnt--;
	/* flow is active again: it counts toward the class budget */
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
	fq_cl->fcl_stat.fcl_flows_cnt++;
}
1826 
/*
 * Park `fq' (already removed from the new/old flow lists) on the
 * scheduler's empty-flow list, where it will linger until
 * fq_empty_purge_delay elapses before being reclaimed.
 */
inline void
fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * NOTE(review): this asserts that some flag *outside* the mask is
	 * set, which is vacuously true whenever any other FQF_ bit is on;
	 * the intent may have been to assert NEW/OLD/FLOWCTL are clear --
	 * confirm before relying on it.
	 */
	ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
	fq->fq_empty_purge_time = now + fq_empty_purge_delay;
	TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
	fq->fq_flags |= FQF_EMPTY_FLOW;
	FQ_CLEAR_OVERWHELMING(fq);
	fqs->fqs_empty_list_cnt++;
	/*
	 * fcl_flows_cnt is used in budget determination for the class.
	 * empty flow shouldn't contribute to the budget.
	 */
	fq_cl->fcl_stat.fcl_flows_cnt--;
}
1843 
/*
 * Reclaim a flow from the empty-flow list: unlink it, clear its
 * empty-state flag, then remove it from the hash table and free it.
 */
static void
fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
{
	fq_if_classq_t *fq_cl;
	fq_cl = &FQ_CLASSQ(fq);

	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
	fq->fq_flags &= ~FQF_EMPTY_FLOW;
	fqs->fqs_empty_list_cnt--;
	/* Remove from the hash list and free the flow queue */
	fq_if_destroy_flow(fqs, fq_cl, fq);
}
1857 
/*
 * Reclaim expired flows from the empty-flow list.  Normally only flows
 * whose purge time has passed are freed, at most FQ_EMPTY_PURGE_MAX per
 * call (the list is ordered by insertion, hence by purge time); with
 * `purge_all' every flow is freed unconditionally.
 */
static void
fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
{
	fq_t *fq, *tmp;
	int i = 0;

	if (fqs->fqs_empty_list_cnt == 0) {
		ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
		return;
	}

	TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
		/* stop at the first not-yet-expired flow or at the batch cap */
		if (!purge_all && ((now < fq->fq_empty_purge_time) ||
		    (i++ == FQ_EMPTY_PURGE_MAX))) {
			break;
		}
		fq_if_purge_empty_flow(fqs, fq);
	}

	if (__improbable(purge_all)) {
		VERIFY(fqs->fqs_empty_list_cnt == 0);
		VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
	}
}
1882 
/*
 * Transition a drained flow out of the old-flows list: release any
 * pending flow-control advisory and park the flow on the empty-flow
 * list for delayed reclamation.
 */
static void
fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * Remove the flow queue from the old flows list.
	 */
	STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
	fq->fq_flags &= ~FQF_OLD_FLOW;
	fq_cl->fcl_stat.fcl_oldflows_cnt--;
	VERIFY(fq->fq_bytes == 0);

	/* release any flow control */
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}

	/* move the flow queue to empty flows list */
	fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
}
1903 
1904 static void
fq_if_empty_new_flow(fq_t * fq,fq_if_classq_t * fq_cl)1905 fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
1906 {
1907 	/* Move to the end of old queue list */
1908 	STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
1909 	    flowq, fq_actlink);
1910 	fq->fq_flags &= ~FQF_NEW_FLOW;
1911 	fq_cl->fcl_stat.fcl_newflows_cnt--;
1912 
1913 	STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
1914 	fq->fq_flags |= FQF_OLD_FLOW;
1915 	fq_cl->fcl_stat.fcl_oldflows_cnt++;
1916 }
1917 
/*
 * Drop one packet from the head of the currently-tracked largest flow
 * to relieve scheduler overflow.  No-op when no large flow is tracked.
 * Updates interface drop stats and the class's overflow-drop counter.
 */
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* queue can not be empty on the largest flow */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	_PKTSCHED_PKT_INIT(&pkt);
	/* pull the head packet off the large flow */
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	/* convert the ifclassq lock before freeing the packet below */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* clear scheduler-private state before the packet leaves AQM */
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* if that drop emptied the flow, retire it from the active lists */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	pktsched_free_pkt(&pkt);
	fq_cl->fcl_stat.fcl_drop_overflow++;
}
1973 
1974 inline void
fq_if_is_flow_heavy(fq_if_t * fqs,fq_t * fq)1975 fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
1976 {
1977 	fq_t *prev_fq;
1978 
1979 	if (fqs->fqs_large_flow != NULL &&
1980 	    fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
1981 		fqs->fqs_large_flow = NULL;
1982 	}
1983 
1984 	if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
1985 		return;
1986 	}
1987 
1988 	prev_fq = fqs->fqs_large_flow;
1989 	if (prev_fq == NULL) {
1990 		if (!fq_empty(fq, fqs->fqs_ptype)) {
1991 			fqs->fqs_large_flow = fq;
1992 		}
1993 		return;
1994 	} else if (fq->fq_bytes > prev_fq->fq_bytes) {
1995 		fqs->fqs_large_flow = fq;
1996 	}
1997 }
1998 
/*
 * Put a flow under flow control: allocate a flow-advisory entry for the
 * packet's flow and queue it on the scheduler's flow-control list.
 * Returns TRUE if the flow is (now or already) on the list, FALSE if
 * the entry could not be allocated.
 */
boolean_t
fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
    fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	/* test hook: flow-control advisories disabled via sysctl/boot-arg */
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	/* avoid duplicate entries for the same source and flow hash */
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == fq->fq_flowhash) {
			/* Already on flowcontrol list */
			return TRUE;
		}
	}
	/* convert the ifclassq lock before a potentially blocking alloc */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		/* XXX Add number of bytes in the queue */
		STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
		fq_cl->fcl_stat.fcl_flow_control++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_control,
		    fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
		    if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	return (fce != NULL) ? TRUE : FALSE;
}
2036 
/*
 * Unlink a flow-control entry from the scheduler's list and hand it to
 * the flow-advisory layer via flowadv_add_entry() for delivery.
 */
static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	STAILQ_NEXT(fce, fce_link) = NULL;
	flowadv_add_entry(fce);
}
2044 
/*
 * Release flow control on a flow that has drained: find its entry on
 * the flow-control list (if any), deliver a feedback advisory, and
 * clear FQF_FLOWCTL_ON on the flow unconditionally.
 */
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* look up this flow's entry by flow hash */
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index,
		    fq->fq_bytes);
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	/* cleared even if no entry was found on the list */
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}
2072 
2073 boolean_t
fq_if_report_ce(fq_if_t * fqs,pktsched_pkt_t * pkt,uint32_t ce_cnt,uint32_t pkt_cnt)2074 fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt,
2075     uint32_t pkt_cnt)
2076 {
2077 	struct flowadv_fcentry *fce;
2078 
2079 #if DEBUG || DEVELOPMENT
2080 	if (__improbable(ifclassq_flow_control_adv == 0)) {
2081 		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
2082 		return TRUE;
2083 	}
2084 #endif /* DEBUG || DEVELOPMENT */
2085 
2086 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
2087 	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
2088 	if (fce != NULL) {
2089 		fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED;
2090 		fce->fce_ce_cnt = ce_cnt;
2091 		fce->fce_pkts_since_last_report = pkt_cnt;
2092 
2093 		flowadv_add_entry(fce);
2094 	}
2095 	return (fce != NULL) ? TRUE : FALSE;
2096 }
2097 
2098 
/*
 * Dequeue packets from one service class using DRR across its new and
 * old flow lists (RFC 8290), honoring pktlimit/bytelimit and, when
 * budget_restricted, the class's remaining fcl_budget.  Dequeued chain
 * is returned via top/bottom (or accumulated per-flow in fq_dqlist),
 * with counts in retpktcnt/retbytecnt.  Pacing: flows whose earliest
 * transmit time is in the future are skipped; if every flow is paced,
 * *fq_cl_paced is set and *next_tx_time reports the earliest such time
 * (FQ_INVALID_TX_TS when unknown).
 */
void
fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now, bool *fq_cl_paced,
    uint64_t *next_tx_time)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	bool all_paced = true;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;
	uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS;

	/* pick the per-packet-type dequeue routine once, up front */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

	/* Pass 1: new flows get strict priority over old flows */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_NEW_FLOW);
		uint64_t fq_tx_time;
		/* paced flow: remember its earliest tx time and skip it */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		/* accumulate per-flow when a dequeue list is provided */
		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/*
		 * From RFC 8290:
		 * if that queue has a negative number of credits (i.e., it has already
		 * dequeued at least a quantum of bytes), it is given an additional
		 * quantum of credits, the queue is put onto _the end of_ the list of
		 * old queues, and the routine selects the next queue and starts again.
		 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}
		//TODO: add credit when it's now paced? so that the fq is treated the same as empty

		/* flow may have become paced by what we just dequeued */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (limit_reached) {
			goto done;
		}
	}

	/* Pass 2: serve old flows with the remaining budget */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;
		uint64_t fq_tx_time;

		/* paced flow: remember its earliest tx time and skip it */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/* flow may have become paced by what we just dequeued */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	/* class is fully paced: record when it can next transmit */
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	/* splice back the flows that were rotated to the end of old list */
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}
2283 
2284 void
fq_if_teardown_ifclassq(struct ifclassq * ifq)2285 fq_if_teardown_ifclassq(struct ifclassq *ifq)
2286 {
2287 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
2288 
2289 	IFCQ_LOCK_ASSERT_HELD(ifq);
2290 	VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL);
2291 	fq_if_destroy(fqs);
2292 	ifq->ifcq_disc = NULL;
2293 	ifclassq_detach(ifq);
2294 }
2295 
2296 static void
fq_export_flowstats(fq_if_t * fqs,fq_t * fq,struct fq_codel_flowstats * flowstat)2297 fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
2298     struct fq_codel_flowstats *flowstat)
2299 {
2300 	bzero(flowstat, sizeof(*flowstat));
2301 	flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
2302 	flowstat->fqst_bytes = fq->fq_bytes;
2303 	flowstat->fqst_flowhash = fq->fq_flowhash;
2304 	if (fq->fq_flags & FQF_NEW_FLOW) {
2305 		flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
2306 	}
2307 	if (fq->fq_flags & FQF_OLD_FLOW) {
2308 		flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
2309 	}
2310 	if (fq->fq_flags & FQF_DELAY_HIGH) {
2311 		flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
2312 	}
2313 	if (fq->fq_flags & FQF_FLOWCTL_ON) {
2314 		flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
2315 	}
2316 	if (fqs->fqs_large_flow == fq) {
2317 		flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
2318 	}
2319 }
2320 
2321 int
fq_if_getqstats_ifclassq(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)2322 fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
2323     struct if_ifclassq_stats *ifqs)
2324 {
2325 	struct fq_codel_classstats *fcls;
2326 	fq_if_classq_t *fq_cl;
2327 	fq_if_t *fqs;
2328 	fq_t *fq = NULL;
2329 	fq_if_group_t *grp;
2330 	u_int32_t i, flowstat_cnt;
2331 
2332 	if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
2333 		return EINVAL;
2334 	}
2335 
2336 	fqs = (fq_if_t *)ifq->ifcq_disc;
2337 	if (fqs->fqs_classq_groups[gid] == NULL) {
2338 		return ENXIO;
2339 	}
2340 
2341 	fcls = &ifqs->ifqs_fq_codel_stats;
2342 
2343 	fq_cl = &FQS_CLASSQ(fqs, gid, qid);
2344 	grp = fq_if_find_grp(fqs, gid);
2345 
2346 	fcls->fcls_pri = fq_cl->fcl_pri;
2347 	fcls->fcls_service_class = fq_cl->fcl_service_class;
2348 	fcls->fcls_quantum = fq_cl->fcl_quantum;
2349 	fcls->fcls_drr_max = fq_cl->fcl_drr_max;
2350 	fcls->fcls_budget = fq_cl->fcl_budget;
2351 	fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
2352 	fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
2353 	fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
2354 	fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
2355 	fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
2356 	fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
2357 	fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
2358 	fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
2359 	fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
2360 	fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
2361 	fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
2362 	fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
2363 	fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
2364 	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2365 	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2366 	fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
2367 	fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
2368 	fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
2369 	fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
2370 	fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
2371 	fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
2372 	fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
2373 	fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
2374 	fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
2375 	fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
2376 	fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
2377 	fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
2378 	fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
2379 	fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
2380 	fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
2381 	fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
2382 	fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
2383 	fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
2384 	fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
2385 	fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;
2386 
2387 	/* Gather per flow stats */
2388 	flowstat_cnt = min((fcls->fcls_newflows_cnt +
2389 	    fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
2390 	i = 0;
2391 	STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
2392 		if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
2393 			break;
2394 		}
2395 
2396 		/* leave space for a few old flows */
2397 		if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
2398 		    i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
2399 			break;
2400 		}
2401 		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2402 		i++;
2403 	}
2404 	STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
2405 		if (i >= flowstat_cnt) {
2406 			break;
2407 		}
2408 		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2409 		i++;
2410 	}
2411 	VERIFY(i <= flowstat_cnt);
2412 	fcls->fcls_flowstats_cnt = i;
2413 	return 0;
2414 }
2415 
/*
 * Create (or, for group 0, update) a scheduler group: allocate the
 * group, initialize its per-service-class queues, mark it combined or
 * separated based on IF_DEFAULT_GRP, and compute its target qdelays
 * and update intervals (classic and L4S).  Returns 0 on success,
 * EINVAL if a non-zero group index is already in use, ENOMEM on
 * allocation failure.
 */
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
/* expands to fq_if_classq_init() with per-class quantum/DRR constants */
#define _FQ_CLASSQ_INIT(_grp, _s, _q)                      \
    fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX,         \
	FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s),     \
	MBUF_SC_ ## _s );

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	/* group 0 may be re-created: just refresh its parameters */
	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		/* driver-managed mode exposes only the four WMM classes */
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		_CASSERT(SCIDX_SIG == SCIDX_VI);
		_CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	/* derive qdelay targets/update intervals for classic and L4S */
	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}
2496 
2497 fq_if_group_t *
fq_if_find_grp(fq_if_t * fqs,uint8_t grp_idx)2498 fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
2499 {
2500 	fq_if_group_t *grp;
2501 
2502 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2503 	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);
2504 
2505 	grp = fqs->fqs_classq_groups[grp_idx];
2506 	VERIFY(grp != NULL);
2507 
2508 	return grp;
2509 }
2510 
2511 static void
fq_if_purge_grp(fq_if_t * fqs,fq_if_group_t * grp)2512 fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
2513 {
2514 	for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
2515 		fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
2516 	}
2517 
2518 	bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
2519 	grp->fqg_len = 0;
2520 	grp->fqg_bytes = 0;
2521 	fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
2522 }
2523 
2524 void
fq_if_destroy_grps(fq_if_t * fqs)2525 fq_if_destroy_grps(fq_if_t *fqs)
2526 {
2527 	fq_if_group_t *grp;
2528 
2529 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2530 
2531 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
2532 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
2533 			continue;
2534 		}
2535 
2536 		grp = fq_if_find_grp(fqs, grp_idx);
2537 		fq_if_purge_grp(fqs, grp);
2538 		zfree(fq_if_grp_zone, grp);
2539 		fqs->fqs_classq_groups[grp_idx] = NULL;
2540 	}
2541 }
2542 
/* TRUE if the group's bit is set in the combined-group bitmap */
static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}
2548 
2549 void
fq_if_set_grp_combined(struct ifclassq * ifcq,uint8_t grp_idx)2550 fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
2551 {
2552 	fq_if_t *fqs;
2553 	fq_if_group_t *grp;
2554 
2555 	IFCQ_LOCK_ASSERT_HELD(ifcq);
2556 
2557 	fqs = (fq_if_t *)ifcq->ifcq_disc;
2558 	grp = fq_if_find_grp(fqs, grp_idx);
2559 
2560 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
2561 		return;
2562 	}
2563 
2564 	/*
2565 	 * We keep the current fq_deficit and fcl_budget when combining a group.
2566 	 * That might disrupt the AQM but only for a moment.
2567 	 */
2568 	pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
2569 	TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2570 }
2571 
2572 void
fq_if_set_grp_separated(struct ifclassq * ifcq,uint8_t grp_idx)2573 fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
2574 {
2575 	fq_if_t *fqs;
2576 	fq_if_group_t *grp;
2577 
2578 	IFCQ_LOCK_ASSERT_HELD(ifcq);
2579 
2580 	fqs = (fq_if_t *)ifcq->ifcq_disc;
2581 	grp = fq_if_find_grp(fqs, grp_idx);
2582 
2583 	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
2584 		return;
2585 	}
2586 
2587 	pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
2588 	TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2589 }
2590