xref: /xnu-11417.140.69/bsd/net/pktsched/pktsched_fq_codel.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <kern/zalloc.h>
32 #include <net/ethernet.h>
33 #include <net/if_var.h>
34 #include <net/if.h>
35 #include <net/droptap.h>
36 #include <net/classq/classq.h>
37 #include <net/classq/classq_fq_codel.h>
38 #include <net/pktsched/pktsched_fq_codel.h>
39 #include <os/log.h>
40 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
41 #include <mach/thread_act.h>
42 #include <kern/thread.h>
43 #include <kern/sched_prim.h>
44 
45 #define FQ_CODEL_DEFAULT_QUANTUM 1500
46 
47 #define FQ_CODEL_QUANTUM_BK_SYS(_q)    (_q)
48 #define FQ_CODEL_QUANTUM_BK(_q)        (_q)
49 #define FQ_CODEL_QUANTUM_BE(_q)        (_q)
50 #define FQ_CODEL_QUANTUM_RD(_q)        (_q)
51 #define FQ_CODEL_QUANTUM_OAM(_q)       (_q)
52 #define FQ_CODEL_QUANTUM_AV(_q)        (_q * 2)
53 #define FQ_CODEL_QUANTUM_RV(_q)        (_q * 2)
54 #define FQ_CODEL_QUANTUM_VI(_q)        (_q * 2)
55 #define FQ_CODEL_QUANTUM_VO(_q)        ((_q * 2) / 5)
56 #define FQ_CODEL_QUANTUM_CTL(_q)       ((_q * 2) / 5)
57 
58 static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
59 static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);
60 
61 SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
62     0, "FQ-CODEL parameters");
63 
64 SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, fq_enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED,
65     &ifclassq_enable_pacing, 0, "Enable pacing");
66 
67 static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
68 #if (DEVELOPMENT || DEBUG)
69 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
70     CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
#endif /* DEVELOPMENT || DEBUG */
72 
73 unsigned int ifclassq_enable_pacing = 1;
74 
75 typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;
76 
77 static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
78 static void fq_if_destroy(fq_if_t *fqs);
79 static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
80     uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
81 static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t,
82     int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
83     uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*);
84 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
85 static void fq_if_purge(fq_if_t *);
86 static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
87 static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
88     uint64_t);
89 static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
90 static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
91     fq_t *fq, uint64_t now);
92 static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
93 static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
94     bool purge_all);
95 static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
96 static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq,
97     mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
98     classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
99     u_int32_t *retbytecnt, uint8_t grp_idx);
100 static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
101     cqrq_stat_sc_t *stat, uint64_t now);
102 static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
103 static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
104 static void fq_if_destroy_grps(fq_if_t *fqs);
105 
/*
 * Per-class DRR (deficit round robin) maximum number of flows serviced in
 * one scheduling round, indexed by class queue index.  Higher-priority
 * classes get a larger share.  Overridable at boot via the "ifcq_drr_max"
 * boot-arg parsed in pktsched_fq_init().
 */
uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
	[FQ_IF_CTL_INDEX]       = 8,
	[FQ_IF_VO_INDEX]        = 8,
	[FQ_IF_VI_INDEX]        = 6,
	[FQ_IF_RV_INDEX]        = 6,
	[FQ_IF_AV_INDEX]        = 6,
	[FQ_IF_OAM_INDEX]       = 4,
	[FQ_IF_RD_INDEX]        = 4,
	[FQ_IF_BE_INDEX]        = 4,
	[FQ_IF_BK_INDEX]        = 2,
	[FQ_IF_BK_SYS_INDEX]    = 2,
};
118 
119 #define FQ_CODEL_DRR_MAX(_s)    fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]
120 
121 static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
122     fq_if_state state);
123 static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
124     fq_if_state dst_state, fq_if_state src_state);
125 static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
126     fq_if_state state);
127 static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
128     fq_if_state state, fq_if_group_t **selected_grp);
129 static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
130     fq_if_state dst_state, fq_if_state src_state);
131 
132 static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
133     fq_if_state state);
134 static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
135     fq_if_state dst_state, fq_if_state src_state);
136 static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
137     fq_if_state state);
138 static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
139     fq_if_state state, fq_if_group_t **selected_grp);
140 static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
141     fq_if_state dst_state, fq_if_state src_state);
142 
/*
 * Bitmap operations used when dequeueing without a service-class filter:
 * each op scans/updates the whole per-group priority bitmap for a state.
 */
bitmap_ops_t fq_if_grps_bitmap_ops =
{
	.ffs    = fq_if_grps_bitmap_ffs,
	.zeros  = fq_if_grps_bitmap_zeros,
	.cpy    = fq_if_grps_bitmap_cpy,
	.clr    = fq_if_grps_bitmap_clr,
	.move   = fq_if_grps_bitmap_move,
};
151 
/*
 * Bitmap operations used for service-class constrained dequeue (driver
 * managed schedulers): each op touches only the single priority bit `pri`.
 */
bitmap_ops_t fq_if_grps_sc_bitmap_ops =
{
	.ffs    = fq_if_grps_sc_bitmap_ffs,
	.zeros  = fq_if_grps_sc_bitmap_zeros,
	.cpy    = fq_if_grps_sc_bitmap_cpy,
	.clr    = fq_if_grps_sc_bitmap_clr,
	.move   = fq_if_grps_sc_bitmap_move,
};
160 
161 static uint32_t fq_if_hash_table_size;
162 
163 extern int serverperfmode; // Temporary to resolve build dependency
164 
/*
 * One-time FQ-CoDel scheduler initialization: applies the pacing boot-arg,
 * sizes the flow hash table, and optionally overrides the per-class DRR
 * maximums from the "ifcq_drr_max" boot-arg.
 */
void
pktsched_fq_init(void)
{
	PE_parse_boot_argn("ifclassq_enable_pacing", &ifclassq_enable_pacing,
	    sizeof(ifclassq_enable_pacing));

	/* Larger hash table (64K buckets vs 256) for server workloads. */
	if (serverperfmode) {
		fq_if_hash_table_size = (1 << 16);
	} else {
		fq_if_hash_table_size = (1 << 8);
	}

	// format looks like ifcq_drr_max=8,8,6
	char buf[(FQ_IF_MAX_CLASSES) * 3];
	size_t i, len, pri_index = 0;
	uint32_t drr = 0;
	if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
		return;
	}

	/*
	 * Parse the comma-separated decimal list.  Iterating up to len + 1
	 * lets the terminating NUL commit the final accumulated value.
	 * Values overwrite fq_codel_drr_max_values[] in index order, i.e.
	 * highest-priority class first.
	 */
	len = strbuflen(buf, sizeof(buf));
	for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
		if (buf[i] != ',' && buf[i] != '\0') {
			/* Only decimal digits and separators are accepted. */
			VERIFY(buf[i] >= '0' && buf[i] <= '9');
			drr = drr * 10 + buf[i] - '0';
			continue;
		}
		fq_codel_drr_max_values[pri_index] = drr;
		pri_index += 1;
		drr = 0;
	}
}
197 
198 static uint32_t
fq_if_flow_hash_id(uint32_t flowid)199 fq_if_flow_hash_id(uint32_t flowid)
200 {
201 	return flowid & (fq_if_hash_table_size - 1);
202 }
203 
204 #define FQ_IF_CLASSQ_IDLE(_fcl_) \
205 	(STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
206 	STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))
207 
208 typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
209 typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
210     int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
211     uint32_t *, boolean_t *, uint64_t);
212 
213 static void
fq_if_append_mbuf(classq_pkt_t * pkt,classq_pkt_t * next_pkt)214 fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
215 {
216 	pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
217 }
218 
219 static inline uint64_t
fq_codel_get_time(void)220 fq_codel_get_time(void)
221 {
222 	struct timespec ts;
223 	uint64_t now;
224 
225 	nanouptime(&ts);
226 	now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
227 	return now;
228 }
229 
#if SKYWALK
/* Chain next_pkt after pkt on a skywalk kernel-packet list. */
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
#endif /* SKYWALK */
237 
#if SKYWALK
/*
 * Dequeue skywalk packets from one flow queue into the caller's chain.
 * Dequeue continues while the flow still has DRR deficit, the queue is
 * non-empty, the next packet is pacing-ready, and neither pkt_limit nor
 * byte_limit has been reached.  *pkt_cnt/*byte_cnt are accumulated (not
 * reset) and *qempty reports whether the flow queue drained.
 * Returns TRUE iff a caller-supplied limit stopped the loop.
 */
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	uint32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	/*
	 * Assert to make sure pflags is part of PKT_F_COMMON_MASK;
	 * all common flags need to be declared in that mask.
	 */
	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_PACKET);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		/* First packet dequeued from a fresh flow is tagged NEW_FLOW. */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* Append to the caller's chain, keeping head/tail consistent. */
		if (head->cp_kpkt == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_kpkt != NULL);
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_kpkt->pkt_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
	return limit_reached;
}
#endif /* SKYWALK */
297 
/*
 * mbuf twin of fq_getq_flow_kpkt(): dequeue mbufs from one flow queue into
 * the caller's chain while DRR deficit remains, the queue is non-empty, the
 * next packet is pacing-ready, and neither pkt_limit nor byte_limit has
 * been reached.  *pkt_cnt/*byte_cnt accumulate; *qempty reports whether the
 * flow queue drained.  Returns TRUE iff a limit stopped the loop.
 */
static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	u_int32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_MBUF);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;

		/* First packet dequeued from a fresh flow is tagged NEW_FLOW. */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* Append to the caller's chain, keeping head/tail consistent. */
		if (head->cp_mbuf == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_mbuf != NULL);
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_mbuf->m_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
	return limit_reached;
}
352 
353 static void
fq_if_pacemaker_tcall(thread_call_param_t arg0,thread_call_param_t arg1)354 fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
355 {
356 #pragma unused(arg1)
357 	struct ifnet* ifp = (struct ifnet*)arg0;
358 	ASSERT(ifp != NULL);
359 
360 	ifnet_start_ignore_delay(ifp);
361 }
362 
/*
 * Allocate and initialize an FQ-CoDel scheduler instance for ifq.
 * Allocates the flow hash table (sized by fq_if_hash_table_size chosen at
 * boot) and the pacemaker thread call.  Returns NULL on allocation failure;
 * the caller owns the returned fq_if_t and releases it via fq_if_destroy().
 */
fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
	flowq_list_t *fqs_flows;
	fq_if_t *fqs;

	ASSERT(ifq->ifcq_ifp != NULL);
	fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
	if (fqs == NULL) {
		return NULL;
	}
	fqs_flows = kalloc_type(flowq_list_t, fq_if_hash_table_size, Z_WAITOK | Z_ZERO);
	if (fqs_flows == NULL) {
		zfree(fq_if_zone, fqs);
		return NULL;
	}
	fqs->fqs_flows = fqs_flows;
	fqs->fqs_flows_count = fq_if_hash_table_size;
	fqs->fqs_ifq = ifq;
	fqs->fqs_ptype = ptype;

	/* Configure packet drop limit across all queues */
	fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
	STAILQ_INIT(&fqs->fqs_fclist);
	TAILQ_INIT(&fqs->fqs_empty_list);
	TAILQ_INIT(&fqs->fqs_combined_grp_list);
	/* One-shot thread call used to doorbell the driver when pacing. */
	fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall,
	    (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);

	return fqs;
}
396 
/*
 * Tear down a scheduler instance: cancel and free the pacemaker thread
 * call, purge all queued packets and groups, then free the flow table and
 * the fq_if_t itself.  Caller must hold the IFCQ lock and must NOT hold
 * the interface start lock (asserted below, since the pacemaker targets
 * the start thread).
 */
void
fq_if_destroy(fq_if_t *fqs)
{
	struct ifnet    *ifp = fqs->fqs_ifq->ifcq_ifp;
	thread_call_t   __single tcall = fqs->fqs_pacemaker_tcall;

	VERIFY(ifp != NULL);
	ASSERT(tcall != NULL);
	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

	/*
	 * Since we are holding the IFCQ lock here, another thread cannot enter AQM
	 * and schedule a pacemaker call. So we do not need a sleep wait loop here
	 * cancel wait and free should succeed in one call.
	 */
	thread_call_cancel_wait(tcall);
	ASSERT(thread_call_free(tcall));

	fq_if_purge(fqs);
	fq_if_destroy_grps(fqs);

	fqs->fqs_ifq = NULL;

	kfree_type_counted_by(flowq_list_t, fqs->fqs_flows_count, fqs->fqs_flows);
	zfree(fq_if_zone, fqs);
}
425 
426 static inline uint8_t
fq_if_service_to_priority(fq_if_t * fqs,mbuf_svc_class_t svc)427 fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
428 {
429 	uint8_t pri;
430 
431 	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
432 		switch (svc) {
433 		case MBUF_SC_BK_SYS:
434 		case MBUF_SC_BK:
435 			pri = FQ_IF_BK_INDEX;
436 			break;
437 		case MBUF_SC_BE:
438 		case MBUF_SC_RD:
439 		case MBUF_SC_OAM:
440 			pri = FQ_IF_BE_INDEX;
441 			break;
442 		case MBUF_SC_AV:
443 		case MBUF_SC_RV:
444 		case MBUF_SC_VI:
445 		case MBUF_SC_SIG:
446 			pri = FQ_IF_VI_INDEX;
447 			break;
448 		case MBUF_SC_VO:
449 		case MBUF_SC_CTL:
450 			pri = FQ_IF_VO_INDEX;
451 			break;
452 		default:
453 			pri = FQ_IF_BE_INDEX; /* Use best effort by default */
454 			break;
455 		}
456 		return pri;
457 	}
458 
459 	/* scheduler is not managed by the driver */
460 	switch (svc) {
461 	case MBUF_SC_BK_SYS:
462 		pri = FQ_IF_BK_SYS_INDEX;
463 		break;
464 	case MBUF_SC_BK:
465 		pri = FQ_IF_BK_INDEX;
466 		break;
467 	case MBUF_SC_BE:
468 		pri = FQ_IF_BE_INDEX;
469 		break;
470 	case MBUF_SC_RD:
471 		pri = FQ_IF_RD_INDEX;
472 		break;
473 	case MBUF_SC_OAM:
474 		pri = FQ_IF_OAM_INDEX;
475 		break;
476 	case MBUF_SC_AV:
477 		pri = FQ_IF_AV_INDEX;
478 		break;
479 	case MBUF_SC_RV:
480 		pri = FQ_IF_RV_INDEX;
481 		break;
482 	case MBUF_SC_VI:
483 		pri = FQ_IF_VI_INDEX;
484 		break;
485 	case MBUF_SC_SIG:
486 		pri = FQ_IF_SIG_INDEX;
487 		break;
488 	case MBUF_SC_VO:
489 		pri = FQ_IF_VO_INDEX;
490 		break;
491 	case MBUF_SC_CTL:
492 		pri = FQ_IF_CTL_INDEX;
493 		break;
494 	default:
495 		pri = FQ_IF_BE_INDEX; /* Use best effort by default */
496 		break;
497 	}
498 	return pri;
499 }
500 
/*
 * Initialize one class queue slot within a group: set its DRR quantum,
 * priority index, per-round DRR maximum, and service class, and reset its
 * flow lists.  A slot may be initialized only once (fcl_quantum must
 * still be zero).
 */
void
fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
    uint32_t drr_max, uint32_t svc_class)
{
	fq_if_classq_t *fq_cl;
	VERIFY(pri < FQ_IF_MAX_CLASSES);
	fq_cl = &fqg->fqg_classq[pri];

	/* Slot must not have been initialized before. */
	VERIFY(fq_cl->fcl_quantum == 0);
	VERIFY(quantum != 0);
	fq_cl->fcl_quantum = quantum;
	fq_cl->fcl_pri = pri;
	fq_cl->fcl_drr_max = drr_max;
	fq_cl->fcl_service_class = svc_class;
	fq_cl->fcl_next_tx_time = 0;
	fq_cl->fcl_flags = 0;
	STAILQ_INIT(&fq_cl->fcl_new_flows);
	STAILQ_INIT(&fq_cl->fcl_old_flows);
}
520 
/*
 * Enqueue a packet chain into the scheduler.  Resolves the target group
 * (skywalk qset index when present) and class queue from the packet's
 * service class, then hands the chain to fq_addq().  On success, interface
 * and group length/byte counters are updated; on failure the chain is
 * dropped and a classq error code is returned.  *pdrop tells the caller
 * whether the chain was freed here.
 */
int
fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
	uint8_t pri, grp_idx = 0;
	fq_if_t *fqs;
	fq_if_classq_t *fq_cl;
	fq_if_group_t *fq_group;
	int ret;
	mbuf_svc_class_t svc;
	pktsched_pkt_t pkt;

	pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
	if (head->cp_ptype == QP_PACKET) {
		grp_idx = head->cp_kpkt->pkt_qset_idx;
	}
#endif /* SKYWALK */
	pri = fq_if_service_to_priority(fqs, svc);
	VERIFY(pri < FQ_IF_MAX_CLASSES);

	IFCQ_LOCK_SPIN(ifq);
	fq_group = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_group->fqg_classq[pri];

	if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
		IFCQ_UNLOCK(ifq);
		/* BK_SYS is currently throttled */
		os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed);
		if (__improbable(droptap_verbose > 0)) {
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
		*pdrop = TRUE;
		ret = EQSUSPENDED;
		goto done;
	}

	ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
	ret = fq_addq(fqs, fq_group, &pkt, fq_cl);
	/* Class now has packets: make it schedulable if it wasn't already. */
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
		    (1 << pri)) == 0) {
			/*
			 * this group is not in ER or EB groups,
			 * mark it as IB
			 */
			pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
		}
	}

	if (__improbable(ret != 0)) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_COMPRESSED) {
			/* packet was merged with an existing one; not a drop */
			ret = 0;
			*pdrop = FALSE;
		} else {
			/* Drop path: map classq result codes to errnos. */
			IFCQ_UNLOCK(ifq);
			*pdrop = TRUE;
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_FULL,
			    __func__, __LINE__, 0);
			switch (ret) {
			case CLASSQEQ_DROP:
				ret = ENOBUFS;
				goto done;
			case CLASSQEQ_DROP_FC:
				ret = EQFULL;
				goto done;
			case CLASSQEQ_DROP_SP:
				ret = EQSUSPENDED;
				goto done;
			default:
				VERIFY(0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* NOTREACHED */
			__builtin_unreachable();
		}
	} else {
		*pdrop = FALSE;
	}
	IFCQ_ADD_LEN(ifq, cnt);
	IFCQ_INC_BYTES(ifq, bytes);


	FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
	FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

	IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
	/* Suppress flow-control advisories when disabled via sysctl. */
	if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
		ret = 0;
	}
#endif /* DEBUG || DEVELOPMENT */
	return ret;
}
627 
628 void
fq_if_dequeue_classq(struct ifclassq * ifq,classq_pkt_t * pkt,uint8_t grp_idx)629 fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx)
630 {
631 	(void) fq_if_dequeue_classq_multi(ifq, 1,
632 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
633 }
634 
635 void
fq_if_dequeue_sc_classq(struct ifclassq * ifq,mbuf_svc_class_t svc,classq_pkt_t * pkt,uint8_t grp_idx)636 fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
637     classq_pkt_t *pkt, uint8_t grp_idx)
638 {
639 	(void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1,
640 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
641 }
642 
/*
 * Append a flow to the pending-dequeue list.  The flow must not already
 * be on the list and must not have packets staged in its fq_dq_head.
 */
static inline void
fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
{
	ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
	ASSERT(!fq->fq_in_dqlist);
	STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
	fq->fq_in_dqlist = true;
}
651 
/*
 * Unlink a flow from the pending-dequeue list, splicing its staged packet
 * chain (fq_dq_head..fq_dq_tail) onto the caller's head/tail chain, then
 * reset the flow's staging pointers.  The cp_mbuf NULL-tests are used for
 * both packet types; this relies on classq_pkt_t being a union so cp_mbuf
 * aliases cp_kpkt (NOTE(review): union layout assumption — confirm in
 * classq.h).
 */
static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	ASSERT(fq->fq_in_dqlist);
	/* Nothing staged: just unlink the flow. */
	if (fq->fq_dq_head.cp_mbuf == NULL) {
		goto done;
	}

	if (head->cp_mbuf == NULL) {
		*head = fq->fq_dq_head;
	} else {
		ASSERT(tail->cp_mbuf != NULL);

		/* Link the staged chain after the caller's current tail. */
		switch (ptype) {
		case QP_MBUF:
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
			ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
			break;
#if SKYWALK
		case QP_PACKET:
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
			ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
			break;
#endif /* SKYWALK */
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}
	*tail = fq->fq_dq_tail;
done:
	STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;
}
692 
/*
 * Drain the pending-dequeue list: splice every flow's staged packets onto
 * the caller's head/tail chain and empty the list.  SAFE iteration is
 * required because fq_dqlist_remove() unlinks entries while we walk.
 */
static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	fq_t *fq, *tfq;

	STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
		fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
	}
}
703 
/*
 * Find the highest-priority set bit in `state` across all groups on the
 * list.  Returns the 1-based priority (pktsched_ffs semantics) and sets
 * *selected_grp, or returns 0 when all bitmaps are empty.  The `<=`
 * comparison means the LAST group in list order wins a priority tie —
 * presumably to spread service across combined groups; confirm intent.
 */
static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
    fq_if_group_t **selected_grp)
{
	#pragma unused(pri)

	fq_if_group_t *grp;
	uint32_t highest_pri = FQ_IF_MAX_CLASSES;
	int ret_pri = 0;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
		/* bitmap is empty in this case */
		if (cur_pri == 0) {
			continue;
		}
		if (cur_pri <= highest_pri) {
			highest_pri = cur_pri;
			ret_pri = cur_pri;
			*selected_grp = grp;
		}
	}
	return ret_pri;
}
728 
729 static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)730 fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
731 {
732     #pragma unused(pri)
733 
734 	fq_if_group_t *grp;
735 
736 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
737 		if (grp->fqg_bitmaps[state] != 0) {
738 			return FALSE;
739 		}
740 	}
741 	return TRUE;
742 }
743 
744 static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)745 fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
746     fq_if_state src_state)
747 {
748     #pragma unused(pri)
749 
750 	fq_if_group_t *grp;
751 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
752 		grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
753 	}
754 }
755 
756 static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)757 fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
758 {
759     #pragma unused(pri)
760 
761 	fq_if_group_t *grp;
762 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
763 		grp->fqg_bitmaps[state] = 0;
764 	}
765 }
766 
767 static void
fq_if_grps_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)768 fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
769     fq_if_state src_state)
770 {
771     #pragma unused(pri)
772 
773 	fq_if_group_t *grp;
774 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
775 		grp->fqg_bitmaps[dst_state] =
776 		    grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state];
777 		grp->fqg_bitmaps[src_state] = 0;
778 	}
779 }
780 
781 static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)782 fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
783     fq_if_group_t **selected_grp)
784 {
785 	fq_if_group_t *grp;
786 	int ret_pri = 0;
787 
788 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
789 		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
790 			/* +1 to match the semantics of pktsched_ffs */
791 			ret_pri = pri + 1;
792 			*selected_grp = grp;
793 			break;
794 		}
795 	}
796 
797 	return ret_pri;
798 }
799 
800 static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)801 fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
802 {
803 	fq_if_group_t *grp;
804 
805 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
806 		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
807 			return FALSE;
808 		}
809 	}
810 	return TRUE;
811 }
812 
813 static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)814 fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
815     fq_if_state src_state)
816 {
817 	fq_if_group_t *grp;
818 
819 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
820 		pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
821 		    &grp->fqg_bitmaps[src_state]);
822 	}
823 }
824 
825 static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)826 fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
827 {
828 	fq_if_group_t *grp;
829 
830 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
831 		pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
832 	}
833 }
834 
835 static void
fq_if_grps_sc_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)836 fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
837     fq_if_state src_state)
838 {
839 	fq_if_group_t *grp;
840 
841 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
842 		pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state],
843 		    &grp->fqg_bitmaps[src_state]);
844 		pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]);
845 	}
846 }
847 
/*
 * Pacemaker is only scheduled when no packet can be dequeued from AQM
 * due to pacing. Pacemaker will doorbell the driver when current >= next_tx_time.
 * This only applies to L4S traffic at this moment.
 */
static void
fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
{
	uint64_t deadline = 0;
	/* No-op unless both pacing and L4S are enabled. */
	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
		return;
	}
	ASSERT(next_tx_time != FQ_INVALID_TX_TS);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);
	ASSERT(now < next_tx_time);

	DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, fqs->fqs_ifq->ifcq_ifp,
	    uint64_t, next_tx_time - now);
	KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index, now,
	    next_tx_time, next_tx_time - now);

	/*
	 * Interval in ns (scale factor 1).  NOTE(review): the interval is
	 * truncated to uint32_t (~4.29s max); presumably pacing deltas are
	 * always far smaller — confirm.
	 */
	clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1, &deadline);
	thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
}
872 
/*
 * Common dequeue path shared by fq_if_dequeue_classq_multi() and
 * fq_if_dequeue_sc_classq_multi().  Dequeues up to maxpktcnt packets or
 * maxbytecnt bytes (whichever limit is reached first) from the group(s)
 * selected by grp_idx, scheduling across the per-group priority bitmaps
 * (ER/EB/IB and, with L4S pacing, IR) using a deficit-round-robin byte
 * budget per class.  The dequeued packet chain is returned through
 * first_packet/last_packet and the totals through retpktcnt/retbytecnt.
 * If every eligible class turned out to be paced, a pacemaker callout is
 * scheduled for the earliest next transmit time.  Always returns 0.
 */
static int
fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	uint32_t total_pktcnt = 0, total_bytecnt = 0;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_classq_t *fq_cl;
	fq_grp_tailq_t *grp_list, tmp_grp_list;
	fq_if_group_t *__single fq_grp = NULL;
	fq_if_t *fqs;
	uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
	int pri = 0, svc_pri = 0;
	bool all_paced = true;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	STAILQ_INIT(&fq_dqlist_head);

	/* Select the chain-append routine matching the queue's packet type. */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	now = fq_codel_get_time();
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		svc_pri = fq_if_service_to_priority(fqs, svc);
	} else {
		/* Non-driver-managed callers must not request a class. */
		VERIFY(svc == MBUF_SC_UNSPEC);
	}

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		grp_list = &fqs->fqs_combined_grp_list;
		VERIFY(!TAILQ_EMPTY(grp_list));
	} else {
		/*
		 * Standalone group: run the scheduler over a temporary
		 * single-entry list containing just this group.
		 */
		grp_list = &tmp_grp_list;
		fq_grp = fq_if_find_grp(fqs, grp_idx);
		TAILQ_INIT(grp_list);
		TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
	}

	for (;;) {
		uint32_t pktcnt = 0, bytecnt = 0;
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		bool fq_cl_all_paced = false;
		uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;

		/*
		 * If both ER and EB are empty, start a new round by
		 * promoting IB to EB; if still nothing is eligible, the
		 * scheduler is done.
		 */
		if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
		    fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
			fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
			fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
			if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
				if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
					/*
					 * Move fq_cl in IR back to ER, so that they will be inspected with priority
					 * the next time the driver dequeues
					 */
					fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
					fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IR);
				}
				break;
			}
		}
		pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
		if (pri == 0) {
			/*
			 * There are no ER flows, move the highest
			 * priority one from EB if there are any in that
			 * category
			 */
			pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
			VERIFY(pri > 0);
			VERIFY(fq_grp != NULL);
			pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
			pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		}
		VERIFY(fq_grp != NULL);
		pri--; /* index starts at 0 */
		fq_cl = &fq_grp->fqg_classq[pri];

		if (fq_cl->fcl_budget <= 0) {
			/* Update the budget */
			fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
			    fq_cl->fcl_stat.fcl_flows_cnt) *
			    fq_cl->fcl_quantum);
			if (fq_cl->fcl_budget <= 0) {
				/* Still negative: demote/park via state_change. */
				goto state_change;
			}
		}
		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
		    &fq_cl_next_tx_time);
		if (head.cp_mbuf != NULL) {
			/* Append the class's chain to the overall result. */
			ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
			append_pkt(&last, &tmp);
		}
		/* Track the earliest transmit time among fully paced classes. */
		if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			next_tx_time = fq_cl_next_tx_time;
		}
		fq_cl->fcl_budget -= bytecnt;
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		/*
		 * If the class has exceeded the budget but still has data
		 * to send, move it to IB
		 */
state_change:
		VERIFY(fq_grp != NULL);
		all_paced &= fq_cl_all_paced;
		if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
			if (fq_cl->fcl_budget <= 0) {
				pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
				pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			} else if (fq_cl_all_paced) {
				if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
					/*
					 * If a fq_cl still has budget but only paced queues, park it
					 * to IR so that we will not keep looping over it
					 */
					pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IR]);
					pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
				}
			}
		} else {
			/* Class went idle: clear it from every active bitmap. */
			pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
			    fq_grp->fqg_bitmaps[FQ_IF_EB] |
			    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
			fq_cl->fcl_budget = 0;
		}
		if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
			if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
				/*
				 * Move fq_cl in IR back to ER, so that they will be inspected with priority
				 * the next time the driver dequeues
				 */
				fqs->grp_bitmaps_move(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
			}
			break;
		}
	}

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		/* Undo the temporary single-entry list. */
		TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
		VERIFY(TAILQ_EMPTY(grp_list));
	}

	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
	    fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}
	if (next_tx_time != FQ_INVALID_TX_TS) {
		/* Some class was entirely paced: arm the pacemaker callout. */
		ASSERT(next_tx_time > now);
		fq_if_schedule_pacemaker(fqs, now, next_tx_time);
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}
1073 
1074 int
fq_if_dequeue_classq_multi(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1075 fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
1076     u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1077     classq_pkt_t *last_packet, u_int32_t *retpktcnt,
1078     u_int32_t *retbytecnt, uint8_t grp_idx)
1079 {
1080 	return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
1081 	           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1082 }
1083 
1084 int
fq_if_dequeue_sc_classq_multi(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1085 fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
1086     u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1087     classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1088     uint8_t grp_idx)
1089 {
1090 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1091 
1092 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
1093 		return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt, maxbytecnt,
1094 		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1095 	} else {
1096 		/*
1097 		 * take a shortcut here since there is no need to schedule
1098 		 * one single service class.
1099 		 */
1100 		return fq_if_dequeue_sc_classq_multi_separate(ifq, svc, maxpktcnt, maxbytecnt,
1101 		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1102 	}
1103 }
1104 
/*
 * Dequeue path for a standalone (non-combined) group: drains only the
 * single class corresponding to svc, first from new flows and then from
 * old flows, up to maxpktcnt/maxbytecnt.  If the class becomes fully
 * paced mid-drain, a pacemaker callout is scheduled and the loop stops.
 * Results are returned through first_packet/last_packet and
 * retpktcnt/retbytecnt.  Always returns 0.
 */
static int
fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
	uint8_t pri;
	u_int32_t total_pktcnt = 0, total_bytecnt = 0;
	fq_if_classq_t *fq_cl;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_group_t *fq_grp;
	uint64_t now;

	/* Select the chain-append routine matching the queue's packet type. */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	STAILQ_INIT(&fq_dqlist_head);
	now = fq_codel_get_time();

	pri = fq_if_service_to_priority(fqs, svc);
	fq_grp = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_grp->fqg_classq[pri];

	/*
	 * Now we have the queue for a particular service class. We need
	 * to dequeue as many packets as needed, first from the new flows
	 * and then from the old flows.
	 */
	while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
	    fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		u_int32_t pktcnt = 0, bytecnt = 0;
		bool all_paced = false;
		uint64_t next_tx_time = FQ_INVALID_TX_TS;

		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time);
		if (head.cp_mbuf != NULL) {
			/* Append this batch to the overall chain. */
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
		}
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		if (next_tx_time != FQ_INVALID_TX_TS) {
			/*
			 * Remaining traffic is paced: arm the pacemaker
			 * and stop draining for now.
			 */
			ASSERT(next_tx_time > now);
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			fq_if_schedule_pacemaker(fqs, now, next_tx_time);
			break;
		}
	}

	/*
	 * Mark classq as IB if it's not idle, so that we can
	 * start without re-init the bitmaps when it's switched
	 * to combined mode.
	 */
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
	} else {
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
		    fq_grp->fqg_bitmaps[FQ_IF_EB] |
		    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
	}

	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}
1217 
/*
 * Drop every packet queued on flow fq, account the drops, and walk the
 * flow through its state transitions (new -> old -> empty -> destroyed).
 * On return *pktsp/*bytesp (if non-NULL) hold the number of packets and
 * bytes purged.  If the flow ends up on the empty list it may be freed
 * here, so the caller must not touch fq afterwards.
 */
static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
	fq_if_classq_t *fq_cl;
	u_int32_t pkts, bytes;
	pktsched_pkt_t pkt;
	fq_if_group_t *grp;

	fq_cl = &FQ_CLASSQ(fq);
	grp = FQ_GROUP(fq);
	pkts = bytes = 0;
	_PKTSCHED_PKT_INIT(&pkt);
	/* Drain the flow completely, dropping (or droptapping) each packet. */
	for (;;) {
		fq_getq_flow(fqs, fq, &pkt, now);
		if (pkt.pktsched_pkt_mbuf == NULL) {
			VERIFY(pkt.pktsched_ptype == QP_INVALID);
			break;
		}
		pkts++;
		bytes += pktsched_get_pkt_len(&pkt);
		if (__improbable(droptap_verbose > 0)) {
			/* Route the drop through droptap for visibility. */
			pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_PURGE_FLOW,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

	IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

	/* move through the flow queue states */
	VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
	if (fq->fq_flags & FQF_NEW_FLOW) {
		fq_if_empty_new_flow(fq, fq_cl);
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		fq_if_empty_old_flow(fqs, fq_cl, fq, now);
	}
	if (fq->fq_flags & FQF_EMPTY_FLOW) {
		fq_if_purge_empty_flow(fqs, fq);
		/* The flow is freed above; NULL the pointer to avoid reuse. */
		fq = NULL;
	}

	if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
		/* Class is now empty: remove it from all scheduler bitmaps. */
		int i;
		for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
			pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
		}
	}

	if (pktsp != NULL) {
		*pktsp = pkts;
	}
	if (bytesp != NULL) {
		*bytesp = bytes;
	}
}
1278 
1279 static void
fq_if_purge_classq(fq_if_t * fqs,fq_if_classq_t * fq_cl)1280 fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1281 {
1282 	fq_t *fq, *tfq;
1283 	uint64_t now;
1284 
1285 	now = fq_codel_get_time();
1286 	/*
1287 	 * Take each flow from new/old flow list and flush mbufs
1288 	 * in that flow
1289 	 */
1290 	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
1291 		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
1292 	}
1293 	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
1294 		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
1295 	}
1296 	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
1297 	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));
1298 
1299 	STAILQ_INIT(&fq_cl->fcl_new_flows);
1300 	STAILQ_INIT(&fq_cl->fcl_old_flows);
1301 	fq_cl->fcl_budget = 0;
1302 }
1303 
1304 static void
fq_if_purge(fq_if_t * fqs)1305 fq_if_purge(fq_if_t *fqs)
1306 {
1307 	uint64_t now;
1308 	fq_if_group_t *grp;
1309 	int i;
1310 
1311 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1312 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1313 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1314 			continue;
1315 		}
1316 
1317 		grp = fq_if_find_grp(fqs, grp_idx);
1318 		fq_if_purge_grp(fqs, grp);
1319 	}
1320 
1321 	now = fq_codel_get_time();
1322 	fq_if_purge_empty_flow_list(fqs, now, true);
1323 
1324 	VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
1325 	VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1326 
1327 	fqs->fqs_large_flow = NULL;
1328 	for (i = 0; i < fqs->fqs_flows_count; i++) {
1329 		VERIFY(LIST_EMPTY(&fqs->fqs_flows[i]));
1330 	}
1331 
1332 	IFCQ_LEN(fqs->fqs_ifq) = 0;
1333 	IFCQ_BYTES(fqs->fqs_ifq) = 0;
1334 }
1335 
1336 static void
fq_if_purge_sc(fq_if_t * fqs,cqrq_purge_sc_t * req)1337 fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
1338 {
1339 	fq_t *fq;
1340 	uint64_t now;
1341 	fq_if_group_t *grp;
1342 
1343 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1344 	req->packets = req->bytes = 0;
1345 	VERIFY(req->flow != 0);
1346 
1347 	now = fq_codel_get_time();
1348 
1349 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1350 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1351 			continue;
1352 		}
1353 		uint32_t bytes = 0, pkts = 0;
1354 
1355 		grp = fq_if_find_grp(fqs, grp_idx);
1356 		/*
1357 		 * Packet and traffic type are needed only if we want
1358 		 * to create a flow queue.
1359 		 */
1360 		fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C);
1361 		if (fq != NULL) {
1362 			fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
1363 			req->bytes += bytes;
1364 			req->packets += pkts;
1365 		}
1366 	}
1367 }
1368 
1369 static uint32_t
fq_if_calc_quantum(struct ifnet * ifp)1370 fq_if_calc_quantum(struct ifnet *ifp)
1371 {
1372 	uint32_t quantum;
1373 
1374 	switch (ifp->if_family) {
1375 	case IFNET_FAMILY_ETHERNET:
1376 		VERIFY(ifp->if_mtu <= IF_MAXMTU);
1377 		quantum = ifp->if_mtu + ETHER_HDR_LEN;
1378 		break;
1379 
1380 	case IFNET_FAMILY_CELLULAR:
1381 	case IFNET_FAMILY_IPSEC:
1382 	case IFNET_FAMILY_UTUN:
1383 		VERIFY(ifp->if_mtu <= UINT16_MAX);
1384 		quantum = ifp->if_mtu;
1385 		break;
1386 
1387 	default:
1388 		quantum = FQ_CODEL_DEFAULT_QUANTUM;
1389 		break;
1390 	}
1391 
1392 	if ((ifp->if_hwassist & IFNET_TSOF) != 0) {
1393 		VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
1394 		VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
1395 		quantum = MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
1396 		quantum = (quantum != 0) ? quantum : IF_MAXMTU;
1397 	}
1398 
1399 	quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
1400 #if DEBUG || DEVELOPMENT
1401 	quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
1402 #endif /* DEBUG || DEVELOPMENT */
1403 	VERIFY(quantum != 0);
1404 	return quantum;
1405 }
1406 
/*
 * Recompute the per-class DRR quantum for every allocated group after an
 * MTU change.  The _FQ_CLASSQ_UPDATE_QUANTUM helper pastes the class
 * index (FQ_IF_<sc>_INDEX) and the per-class scaling macro
 * (FQ_CODEL_QUANTUM_<sc>) from the service-class token.
 */
static void
fq_if_mtu_update(fq_if_t *fqs)
{
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q)                     \
	(_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum =        \
	    FQ_CODEL_QUANTUM_ ## _s(_q)                                 \

	uint32_t quantum;
	fq_if_group_t *grp;

	quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);

		/* Driver-managed schedulers expose only four classes. */
		if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
		} else {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
		}
	}
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}
1446 
1447 static void
fq_if_event(fq_if_t * fqs,cqev_t ev)1448 fq_if_event(fq_if_t *fqs, cqev_t ev)
1449 {
1450 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1451 
1452 	switch (ev) {
1453 	case CLASSQ_EV_LINK_UP:
1454 	case CLASSQ_EV_LINK_DOWN:
1455 		fq_if_purge(fqs);
1456 		break;
1457 	case CLASSQ_EV_LINK_MTU:
1458 		fq_if_mtu_update(fqs);
1459 		break;
1460 	default:
1461 		break;
1462 	}
1463 }
1464 
1465 static void
fq_if_classq_suspend(fq_if_t * fqs,fq_if_classq_t * fq_cl)1466 fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1467 {
1468 	fq_if_purge_classq(fqs, fq_cl);
1469 	fqs->fqs_throttle = 1;
1470 	fq_cl->fcl_stat.fcl_throttle_on++;
1471 	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
1472 	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1473 }
1474 
1475 static void
fq_if_classq_resume(fq_if_t * fqs,fq_if_classq_t * fq_cl)1476 fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1477 {
1478 	VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
1479 	fqs->fqs_throttle = 0;
1480 	fq_cl->fcl_stat.fcl_throttle_off++;
1481 	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
1482 	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1483 }
1484 
1485 
1486 static int
fq_if_throttle(fq_if_t * fqs,cqrq_throttle_t * tr)1487 fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
1488 {
1489 	struct ifclassq *ifq = fqs->fqs_ifq;
1490 	uint8_t index;
1491 	fq_if_group_t *grp;
1492 
1493 #if !MACH_ASSERT
1494 #pragma unused(ifq)
1495 #endif
1496 	IFCQ_LOCK_ASSERT_HELD(ifq);
1497 
1498 	if (!tr->set) {
1499 		tr->level = fqs->fqs_throttle;
1500 		return 0;
1501 	}
1502 
1503 	if (tr->level == fqs->fqs_throttle) {
1504 		return EALREADY;
1505 	}
1506 
1507 	/* Throttling is allowed on BK_SYS class only */
1508 	index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);
1509 
1510 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1511 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1512 			continue;
1513 		}
1514 		grp = fq_if_find_grp(fqs, grp_idx);
1515 		switch (tr->level) {
1516 		case IFNET_THROTTLE_OFF:
1517 			fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
1518 			break;
1519 		case IFNET_THROTTLE_OPPORTUNISTIC:
1520 			fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
1521 			break;
1522 		default:
1523 			break;
1524 		}
1525 	}
1526 	return 0;
1527 }
1528 
1529 static inline boolean_t
fq_if_is_fq_cl_paced(fq_if_classq_t * fq_cl,uint64_t now)1530 fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now)
1531 {
1532 	if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) {
1533 		return true;
1534 	}
1535 
1536 	fq_cl->fcl_flags &= ~FCL_PACED;
1537 	fq_cl->fcl_next_tx_time = 0;
1538 	return false;
1539 }
1540 
1541 static void
fq_if_grp_stat_sc(fq_if_t * fqs,fq_if_group_t * grp,cqrq_stat_sc_t * stat,uint64_t now)1542 fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now)
1543 {
1544 	uint8_t pri;
1545 	fq_if_classq_t *fq_cl;
1546 
1547 	ASSERT(stat != NULL);
1548 	pri = fq_if_service_to_priority(fqs, stat->sc);
1549 
1550 	fq_cl = &grp->fqg_classq[pri];
1551 	stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
1552 	stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;
1553 
1554 	if (ifclassq_enable_pacing && ifclassq_enable_l4s &&
1555 	    fq_if_is_fq_cl_paced(fq_cl, now)) {
1556 		stat->packets = 0;
1557 		stat->bytes = 0;
1558 	}
1559 }
1560 
1561 static boolean_t
fq_if_is_grp_all_paced(fq_if_group_t * grp)1562 fq_if_is_grp_all_paced(fq_if_group_t *grp)
1563 {
1564 	fq_if_classq_t *fq_cl;
1565 	uint64_t now;
1566 
1567 	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
1568 		return false;
1569 	}
1570 
1571 	now = fq_codel_get_time();
1572 	for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) {
1573 		fq_cl = &grp->fqg_classq[fq_cl_idx];
1574 		if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) {
1575 			continue;
1576 		}
1577 		if (!fq_if_is_fq_cl_paced(fq_cl, now)) {
1578 			return false;
1579 		}
1580 	}
1581 
1582 	return true;
1583 }
1584 
1585 boolean_t
fq_if_is_all_paced(struct ifclassq * ifq)1586 fq_if_is_all_paced(struct ifclassq *ifq)
1587 {
1588 	fq_if_group_t *grp;
1589 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1590 
1591 	IFCQ_LOCK_ASSERT_HELD(ifq);
1592 
1593 	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
1594 		return false;
1595 	}
1596 
1597 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1598 		grp = fqs->fqs_classq_groups[grp_idx];
1599 		if (grp == NULL || FQG_BYTES(grp) == 0) {
1600 			continue;
1601 		}
1602 
1603 		if (!fq_if_is_grp_all_paced(grp)) {
1604 			return false;
1605 		}
1606 	}
1607 
1608 	return true;
1609 }
1610 
/*
 * Report queued packet/byte counts for a service class (or the whole
 * queue when sc == MBUF_SC_UNSPEC), scoped either to all groups
 * (IF_CLASSQ_ALL_GRPS) or to the group identified by stat->grp_idx.
 * Groups or classes whose traffic is entirely paced are reported as
 * empty so the driver does not poll for packets that cannot be sent yet.
 */
void
fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
{
	cqrq_stat_sc_t grp_sc_stat;
	fq_if_group_t *grp;
	uint64_t now = fq_codel_get_time();

	if (stat == NULL) {
		return;
	}
	grp_sc_stat.sc = stat->sc;
	stat->packets = 0;
	stat->bytes = 0;

	if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
		if (stat->sc == MBUF_SC_UNSPEC) {
			/* Whole queue, all groups. */
			if (!fq_if_is_all_paced(fqs->fqs_ifq)) {
				stat->packets = IFCQ_LEN(fqs->fqs_ifq);
				stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
			}
		} else {
			/* One class, summed over all allocated groups. */
			for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				grp = fqs->fqs_classq_groups[grp_idx];
				if (grp == NULL) {
					continue;
				}

				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		}
		return;
	}

	if (stat->sc == MBUF_SC_UNSPEC) {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			/* Combined mode: aggregate over the combined list. */
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				stat->packets += FQG_LEN(grp);
				stat->bytes += FQG_BYTES(grp);
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			if (!fq_if_is_grp_all_paced(grp)) {
				stat->packets = FQG_LEN(grp);
				stat->bytes = FQG_BYTES(grp);
			}
		}
	} else {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			/* One class, aggregated over the combined groups. */
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			fq_if_grp_stat_sc(fqs, grp, stat, now);
		}
	}
}
1678 
1679 int
fq_if_request_classq(struct ifclassq * ifq,cqrq_t rq,void * arg)1680 fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg)
1681 {
1682 	int err = 0;
1683 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1684 
1685 	IFCQ_LOCK_ASSERT_HELD(ifq);
1686 
1687 	/*
1688 	 * These are usually slow operations, convert the lock ahead of time
1689 	 */
1690 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1691 	switch (rq) {
1692 	case CLASSQRQ_PURGE:
1693 		fq_if_purge(fqs);
1694 		break;
1695 	case CLASSQRQ_PURGE_SC:
1696 		fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
1697 		break;
1698 	case CLASSQRQ_EVENT:
1699 		fq_if_event(fqs, *(cqev_t *)arg);
1700 		break;
1701 	case CLASSQRQ_THROTTLE:
1702 		fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
1703 		break;
1704 	case CLASSQRQ_STAT_SC:
1705 		fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
1706 		break;
1707 	}
1708 	return err;
1709 }
1710 
/*
 * Allocate and attach an fq_codel scheduler to the interface queue.
 * Selects the driver-managed (service-class) or plain bitmap ops based
 * on flags, attaches the discipline, and creates the default group 0.
 * Returns 0 on success or an errno on failure.
 */
int
fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	fq_if_t *fqs = NULL;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

	fqs = fq_if_alloc(ifq, ptype);
	if (fqs == NULL) {
		return ENOMEM;
	}
	if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
		fqs->fqs_flags |= FQS_DRIVER_MANAGED;
		fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
	} else {
		fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
	}

	err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
		    "failed to attach fq_if: %d\n", __func__, err);
		fq_if_destroy(fqs);
		return err;
	}

	/*
	 * Always create one group. If qset 0 is added later,
	 * this group will be updated.
	 */
	err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
	if (err != 0) {
		/*
		 * NOTE(review): fqs is destroyed here after ifclassq_attach
		 * succeeded — presumably fq_if_destroy (or the caller)
		 * unwinds ifq->ifcq_disc; confirm no dangling reference
		 * remains on this error path.
		 */
		os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
		    "failed to create a fq group: %d\n", __func__, err);
		fq_if_destroy(fqs);
	}

	return err;
}
1754 
1755 fq_t *
fq_if_hash_pkt(fq_if_t * fqs,fq_if_group_t * fq_grp,uint32_t flowid,mbuf_svc_class_t svc_class,uint64_t now,bool create,fq_tfc_type_t tfc_type)1756 fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, uint32_t flowid,
1757     mbuf_svc_class_t svc_class, uint64_t now, bool create,
1758     fq_tfc_type_t tfc_type)
1759 {
1760 	fq_t *fq = NULL;
1761 	flowq_list_t *fq_list;
1762 	fq_if_classq_t *fq_cl;
1763 	uint32_t fqs_hash_id;
1764 	u_int8_t scidx;
1765 
1766 	scidx = fq_if_service_to_priority(fqs, svc_class);
1767 
1768 	fqs_hash_id = fq_if_flow_hash_id(flowid);
1769 
1770 	fq_list = &fqs->fqs_flows[fqs_hash_id];
1771 
1772 	LIST_FOREACH(fq, fq_list, fq_hashlink) {
1773 		if (fq->fq_flowhash == flowid &&
1774 		    fq->fq_sc_index == scidx &&
1775 		    fq->fq_tfc_type == tfc_type &&
1776 		    fq->fq_group == fq_grp) {
1777 			break;
1778 		}
1779 	}
1780 	if (fq == NULL && create) {
1781 		/* If the flow is not already on the list, allocate it */
1782 		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1783 		fq = fq_alloc(fqs->fqs_ptype);
1784 		if (fq != NULL) {
1785 			fq->fq_flowhash = flowid;
1786 			fq->fq_sc_index = scidx;
1787 			fq->fq_group = fq_grp;
1788 			fq->fq_tfc_type = tfc_type;
1789 			fq_cl = &FQ_CLASSQ(fq);
1790 			fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
1791 			fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
1792 			fq->fq_next_tx_time = FQ_INVALID_TX_TS;
1793 			LIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
1794 			fq_cl->fcl_stat.fcl_flows_cnt++;
1795 		}
1796 		KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
1797 		    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
1798 		    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
1799 	} else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
1800 		fq_if_reuse_empty_flow(fqs, fq, now);
1801 	}
1802 
1803 	/*
1804 	 * If getq time is not set because this is the first packet or after
1805 	 * idle time, set it now so that we can detect a stall.
1806 	 */
1807 	if (fq != NULL && fq->fq_getqtime == 0) {
1808 		fq->fq_getqtime = now;
1809 	}
1810 
1811 	return fq;
1812 }
1813 
1814 void
fq_if_destroy_flow(fq_if_t * fqs,fq_if_classq_t * fq_cl,fq_t * fq)1815 fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
1816 {
1817 	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
1818 	LIST_REMOVE(fq, fq_hashlink);
1819 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1820 	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
1821 		fq_if_flow_feedback(fqs, fq, fq_cl);
1822 	}
1823 	KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
1824 	    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
1825 	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
1826 	fq_destroy(fq, fqs->fqs_ptype);
1827 }
1828 
1829 inline boolean_t
fq_if_at_drop_limit(fq_if_t * fqs)1830 fq_if_at_drop_limit(fq_if_t *fqs)
1831 {
1832 	return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
1833 	       TRUE : FALSE;
1834 }
1835 
1836 inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t * fqs)1837 fq_if_almost_at_drop_limit(fq_if_t *fqs)
1838 {
1839 	/*
1840 	 * Whether we are above 90% of the queue limit. This is used to tell if we
1841 	 * can stop flow controlling the largest flow.
1842 	 */
1843 	return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
1844 }
1845 
1846 static inline void
fq_if_reuse_empty_flow(fq_if_t * fqs,fq_t * fq,uint64_t now)1847 fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
1848 {
1849 	ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
1850 	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
1851 	STAILQ_NEXT(fq, fq_actlink) = NULL;
1852 	fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
1853 	fq->fq_empty_purge_time = 0;
1854 	fq->fq_getqtime = 0;
1855 	fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
1856 	fqs->fqs_empty_list_cnt--;
1857 	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
1858 	fq_cl->fcl_stat.fcl_flows_cnt++;
1859 }
1860 
1861 inline void
fq_if_move_to_empty_flow(fq_if_t * fqs,fq_if_classq_t * fq_cl,fq_t * fq,uint64_t now)1862 fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
1863     uint64_t now)
1864 {
1865 	ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
1866 	fq->fq_empty_purge_time = now + fq_empty_purge_delay;
1867 	TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
1868 	fq->fq_flags |= FQF_EMPTY_FLOW;
1869 	FQ_CLEAR_OVERWHELMING(fq);
1870 	fqs->fqs_empty_list_cnt++;
1871 	/*
1872 	 * fcl_flows_cnt is used in budget determination for the class.
1873 	 * empty flow shouldn't contribute to the budget.
1874 	 */
1875 	fq_cl->fcl_stat.fcl_flows_cnt--;
1876 }
1877 
1878 static void
fq_if_purge_empty_flow(fq_if_t * fqs,fq_t * fq)1879 fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
1880 {
1881 	fq_if_classq_t *fq_cl;
1882 	fq_cl = &FQ_CLASSQ(fq);
1883 
1884 	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
1885 	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
1886 	fq->fq_flags &= ~FQF_EMPTY_FLOW;
1887 	fqs->fqs_empty_list_cnt--;
1888 	/* Remove from the hash list and free the flow queue */
1889 	fq_if_destroy_flow(fqs, fq_cl, fq);
1890 }
1891 
1892 static void
fq_if_purge_empty_flow_list(fq_if_t * fqs,uint64_t now,bool purge_all)1893 fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
1894 {
1895 	fq_t *fq, *tmp;
1896 	int i = 0;
1897 
1898 	if (fqs->fqs_empty_list_cnt == 0) {
1899 		ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
1900 		return;
1901 	}
1902 
1903 	TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
1904 		if (!purge_all && ((now < fq->fq_empty_purge_time) ||
1905 		    (i++ == FQ_EMPTY_PURGE_MAX))) {
1906 			break;
1907 		}
1908 		fq_if_purge_empty_flow(fqs, fq);
1909 	}
1910 
1911 	if (__improbable(purge_all)) {
1912 		VERIFY(fqs->fqs_empty_list_cnt == 0);
1913 		VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1914 	}
1915 }
1916 
1917 static void
fq_if_empty_old_flow(fq_if_t * fqs,fq_if_classq_t * fq_cl,fq_t * fq,uint64_t now)1918 fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
1919     uint64_t now)
1920 {
1921 	/*
1922 	 * Remove the flow queue from the old flows list.
1923 	 */
1924 	STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
1925 	fq->fq_flags &= ~FQF_OLD_FLOW;
1926 	fq_cl->fcl_stat.fcl_oldflows_cnt--;
1927 	VERIFY(fq->fq_bytes == 0);
1928 
1929 	/* release any flow control */
1930 	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
1931 		fq_if_flow_feedback(fqs, fq, fq_cl);
1932 	}
1933 
1934 	/* move the flow queue to empty flows list */
1935 	fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
1936 }
1937 
1938 static void
fq_if_empty_new_flow(fq_t * fq,fq_if_classq_t * fq_cl)1939 fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
1940 {
1941 	/* Move to the end of old queue list */
1942 	STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
1943 	    flowq, fq_actlink);
1944 	fq->fq_flags &= ~FQF_NEW_FLOW;
1945 	fq_cl->fcl_stat.fcl_newflows_cnt--;
1946 
1947 	STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
1948 	fq->fq_flags |= FQF_OLD_FLOW;
1949 	fq_cl->fcl_stat.fcl_oldflows_cnt++;
1950 }
1951 
/*
 * Drop one packet from the head of the currently-tracked largest flow to
 * relieve queue-limit pressure.  No-op when no large flow is being tracked.
 * Called with the ifclassq lock held; may convert it to exclusive mode.
 */
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *__single pkt_flags;
	uint64_t *__single pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* queue can not be empty on the largest flow */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	/* Pull the head packet out of the flow queue. */
	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* Scrub scheduler-private metadata before handing the packet back. */
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * If the drop emptied the flow, stop tracking it as the large flow
	 * and move it off its active (new/old) list.
	 */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	/* Route the dropped packet through droptap when verbose capture is on. */
	if (__improbable(droptap_verbose > 0)) {
		pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_DROP,
		    __func__, __LINE__, 0);
	} else {
		pktsched_free_pkt(&pkt);
	}
	fq_cl->fcl_stat.fcl_drop_overflow++;
}
2012 
2013 inline void
fq_if_is_flow_heavy(fq_if_t * fqs,fq_t * fq)2014 fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
2015 {
2016 	fq_t *prev_fq;
2017 
2018 	if (fqs->fqs_large_flow != NULL &&
2019 	    fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2020 		fqs->fqs_large_flow = NULL;
2021 	}
2022 
2023 	if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2024 		return;
2025 	}
2026 
2027 	prev_fq = fqs->fqs_large_flow;
2028 	if (prev_fq == NULL) {
2029 		if (!fq_empty(fq, fqs->fqs_ptype)) {
2030 			fqs->fqs_large_flow = fq;
2031 		}
2032 		return;
2033 	} else if (fq->fq_bytes > prev_fq->fq_bytes) {
2034 		fqs->fqs_large_flow = fq;
2035 	}
2036 }
2037 
/*
 * Put flow `fq' under flow control by adding a flow-advisory entry for it.
 * Returns TRUE when the flow is (or already was) on the flow control list,
 * FALSE when an entry could not be allocated.  May convert the ifclassq
 * lock for the blocking allocation.
 */
boolean_t
fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
    fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	/* Test knob: pretend flow control succeeded without doing anything. */
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Avoid duplicate entries for the same source/flow pair. */
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == fq->fq_flowhash) {
			/* Already on flowcontrol list */
			return TRUE;
		}
	}
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		/* XXX Add number of bytes in the queue */
		STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
		fq_cl->fcl_stat.fcl_flow_control++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_control,
		    fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
		    if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}

	/* Channel flows additionally get the flow advisory bit set. */
	if (fce != NULL && fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
		kern_channel_flowadv_set(fce);
	}

	return (fce != NULL) ? TRUE : FALSE;
}
2080 
/*
 * Take `fce' off the scheduler's flow control list and hand it to the
 * flow-advisory subsystem for delivery to the flow's owner.
 */
static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	STAILQ_NEXT(fce, fce_link) = NULL;
	flowadv_add_entry(fce);
}
2088 
/*
 * Release flow control on `fq': find its flow-advisory entry (if any),
 * mark it as feedback, and dispatch it so the flow's owner may resume
 * sending.  Always clears FQF_FLOWCTL_ON on the flow.
 */
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* Look up the entry added by fq_if_add_fcentry() for this flow. */
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index,
		    fq->fq_bytes);
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}
2116 
2117 boolean_t
fq_if_report_ce(fq_if_t * fqs,pktsched_pkt_t * pkt,uint32_t ce_cnt,uint32_t pkt_cnt)2118 fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt,
2119     uint32_t pkt_cnt)
2120 {
2121 	struct flowadv_fcentry *fce;
2122 
2123 #if DEBUG || DEVELOPMENT
2124 	if (__improbable(ifclassq_flow_control_adv == 0)) {
2125 		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
2126 		return TRUE;
2127 	}
2128 #endif /* DEBUG || DEVELOPMENT */
2129 
2130 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
2131 	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
2132 	if (fce != NULL) {
2133 		fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED;
2134 		fce->fce_ce_cnt = ce_cnt;
2135 		fce->fce_pkts_since_last_report = pkt_cnt;
2136 
2137 		flowadv_add_entry(fce);
2138 	}
2139 	return (fce != NULL) ? TRUE : FALSE;
2140 }
2141 
2142 
/*
 * Dequeue up to `pktlimit' packets / `bytelimit' bytes from class `fq_cl',
 * servicing new flows first and then old flows, per the DRR scheme of
 * RFC 8290.  Dequeued packets are chained at `top'/`bottom', or appended
 * to per-flow chains tracked in `fq_dqlist' when that is non-NULL.
 * Counts are reported via `retpktcnt'/`retbytecnt'.  When every eligible
 * flow is pacing (not ready to transmit yet), *fq_cl_paced is set and
 * *next_tx_time carries the earliest eligible transmit time.
 */
void
fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now, bool *fq_cl_paced,
    uint64_t *next_tx_time)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	bool all_paced = true;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;
	uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS;

	/* Pick the packet-type specific per-flow dequeue routine. */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

	/* First pass: service the new flows list. */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_NEW_FLOW);
		uint64_t fq_tx_time;
		/* Skip flows that are pacing; remember the earliest tx time. */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		/*
		 * With a dqlist, packets accumulate on per-flow chains that
		 * the caller finalizes later; otherwise chain directly onto
		 * the caller's top/last pointers.
		 */
		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/*
		 * From RFC 8290:
		 * if that queue has a negative number of credits (i.e., it has already
		 * dequeued at least a quantum of bytes), it is given an additional
		 * quantum of credits, the queue is put onto _the end of_ the list of
		 * old queues, and the routine selects the next queue and starts again.
		 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}
		//TODO: add credit when it's now paced? so that the fq is treated the same as empty

		/* Re-check pacing after the dequeue drained some packets. */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (limit_reached) {
			goto done;
		}
	}

	/* Second pass: service the old flows list. */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;
		uint64_t fq_tx_time;

		/* Skip flows that are pacing; remember the earliest tx time. */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/* Re-check pacing after the dequeue drained some packets. */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	/* All eligible flows were pacing: record when the class is next ready. */
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	/* Re-append the flows that ran out of deficit to the old flows list. */
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}
2327 
2328 void
fq_if_teardown_ifclassq(struct ifclassq * ifq)2329 fq_if_teardown_ifclassq(struct ifclassq *ifq)
2330 {
2331 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
2332 
2333 	IFCQ_LOCK_ASSERT_HELD(ifq);
2334 	VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL);
2335 	fq_if_destroy(fqs);
2336 	ifq->ifcq_disc = NULL;
2337 	ifclassq_detach(ifq);
2338 }
2339 
2340 static void
fq_export_flowstats(fq_if_t * fqs,fq_t * fq,struct fq_codel_flowstats * flowstat)2341 fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
2342     struct fq_codel_flowstats *flowstat)
2343 {
2344 	bzero(flowstat, sizeof(*flowstat));
2345 	flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
2346 	flowstat->fqst_bytes = fq->fq_bytes;
2347 	flowstat->fqst_flowhash = fq->fq_flowhash;
2348 	if (fq->fq_flags & FQF_NEW_FLOW) {
2349 		flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
2350 	}
2351 	if (fq->fq_flags & FQF_OLD_FLOW) {
2352 		flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
2353 	}
2354 	if (fq->fq_flags & FQF_DELAY_HIGH) {
2355 		flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
2356 	}
2357 	if (fq->fq_flags & FQF_FLOWCTL_ON) {
2358 		flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
2359 	}
2360 	if (fqs->fqs_large_flow == fq) {
2361 		flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
2362 	}
2363 }
2364 
2365 int
fq_if_getqstats_ifclassq(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)2366 fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
2367     struct if_ifclassq_stats *ifqs)
2368 {
2369 	struct fq_codel_classstats *fcls;
2370 	fq_if_classq_t *fq_cl;
2371 	fq_if_t *fqs;
2372 	fq_t *fq = NULL;
2373 	fq_if_group_t *grp;
2374 	u_int32_t i, flowstat_cnt;
2375 
2376 	if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
2377 		return EINVAL;
2378 	}
2379 
2380 	fqs = (fq_if_t *)ifq->ifcq_disc;
2381 	if (fqs->fqs_classq_groups[gid] == NULL) {
2382 		return ENXIO;
2383 	}
2384 
2385 	fcls = &ifqs->ifqs_fq_codel_stats;
2386 
2387 	fq_cl = &FQS_CLASSQ(fqs, gid, qid);
2388 	grp = fq_if_find_grp(fqs, gid);
2389 
2390 	fcls->fcls_pri = fq_cl->fcl_pri;
2391 	fcls->fcls_service_class = fq_cl->fcl_service_class;
2392 	fcls->fcls_quantum = fq_cl->fcl_quantum;
2393 	fcls->fcls_drr_max = fq_cl->fcl_drr_max;
2394 	fcls->fcls_budget = fq_cl->fcl_budget;
2395 	fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
2396 	fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
2397 	fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
2398 	fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
2399 	fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
2400 	fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
2401 	fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
2402 	fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
2403 	fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
2404 	fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
2405 	fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
2406 	fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
2407 	fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
2408 	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2409 	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2410 	fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
2411 	fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
2412 	fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
2413 	fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
2414 	fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
2415 	fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
2416 	fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
2417 	fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
2418 	fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
2419 	fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
2420 	fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
2421 	fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
2422 	fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
2423 	fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
2424 	fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
2425 	fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
2426 	fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
2427 	fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
2428 	fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
2429 	fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;
2430 
2431 	/* Gather per flow stats */
2432 	flowstat_cnt = min((fcls->fcls_newflows_cnt +
2433 	    fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
2434 	i = 0;
2435 	STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
2436 		if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
2437 			break;
2438 		}
2439 
2440 		/* leave space for a few old flows */
2441 		if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
2442 		    i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
2443 			break;
2444 		}
2445 		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2446 		i++;
2447 	}
2448 	STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
2449 		if (i >= flowstat_cnt) {
2450 			break;
2451 		}
2452 		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2453 		i++;
2454 	}
2455 	VERIFY(i <= flowstat_cnt);
2456 	fcls->fcls_flowstats_cnt = i;
2457 	return 0;
2458 }
2459 
/*
 * Create (or, for group 0, re-configure) classq group `grp_idx' on the
 * fq-codel scheduler of `ifcq'.  Initializes the per-class queues, places
 * the group in combined or separated dequeue mode based on IF_DEFAULT_GRP,
 * and computes the target queue delays / update intervals for both the
 * classic and L4S traffic classes.
 *
 * Returns 0 on success, EINVAL when a non-zero group index already exists,
 * ENOMEM on allocation failure.
 */
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
/* Expands to fq_if_classq_init() with the per-service-class constants. */
#define _FQ_CLASSQ_INIT(_grp, _s, _q)                      \
    fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX,         \
	FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s),     \
	MBUF_SC_ ## _s );

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	/* Group 0 may be re-configured in place; skip to the flag update. */
	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	/* NOTE(review): Z_WAITOK allocations normally block rather than
	 * return NULL; the check is kept as defensive coding. */
	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	/* Driver-managed schedulers expose only the four WMM access classes. */
	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		_CASSERT(SCIDX_SIG == SCIDX_VI);
		_CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	/* The default group is dequeued combined with the other groups. */
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	/* Compute AQM targets for classic and L4S traffic separately. */
	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}
2540 
2541 fq_if_group_t *
fq_if_find_grp(fq_if_t * fqs,uint8_t grp_idx)2542 fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
2543 {
2544 	fq_if_group_t *grp;
2545 
2546 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2547 	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);
2548 
2549 	grp = fqs->fqs_classq_groups[grp_idx];
2550 	VERIFY(grp != NULL);
2551 
2552 	return grp;
2553 }
2554 
2555 static void
fq_if_purge_grp(fq_if_t * fqs,fq_if_group_t * grp)2556 fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
2557 {
2558 	for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
2559 		fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
2560 	}
2561 
2562 	bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
2563 	grp->fqg_len = 0;
2564 	grp->fqg_bytes = 0;
2565 	fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
2566 }
2567 
2568 void
fq_if_destroy_grps(fq_if_t * fqs)2569 fq_if_destroy_grps(fq_if_t *fqs)
2570 {
2571 	fq_if_group_t *__single grp;
2572 
2573 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2574 
2575 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
2576 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
2577 			continue;
2578 		}
2579 
2580 		grp = fq_if_find_grp(fqs, grp_idx);
2581 		fq_if_purge_grp(fqs, grp);
2582 		zfree(fq_if_grp_zone, grp);
2583 		fqs->fqs_classq_groups[grp_idx] = NULL;
2584 	}
2585 }
2586 
/*
 * Return whether group `grp_idx' is currently dequeued together with the
 * other combined groups (its bit is set in fqs_combined_grp_bitmap).
 */
static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}
2592 
2593 void
fq_if_set_grp_combined(struct ifclassq * ifcq,uint8_t grp_idx)2594 fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
2595 {
2596 	fq_if_t *fqs;
2597 	fq_if_group_t *grp;
2598 
2599 	IFCQ_LOCK_ASSERT_HELD(ifcq);
2600 
2601 	fqs = (fq_if_t *)ifcq->ifcq_disc;
2602 	grp = fq_if_find_grp(fqs, grp_idx);
2603 
2604 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
2605 		return;
2606 	}
2607 
2608 	/*
2609 	 * We keep the current fq_deficit and fcl_budget when combining a group.
2610 	 * That might disrupt the AQM but only for a moment.
2611 	 */
2612 	pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
2613 	TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2614 }
2615 
2616 void
fq_if_set_grp_separated(struct ifclassq * ifcq,uint8_t grp_idx)2617 fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
2618 {
2619 	fq_if_t *fqs;
2620 	fq_if_group_t *grp;
2621 
2622 	IFCQ_LOCK_ASSERT_HELD(ifcq);
2623 
2624 	fqs = (fq_if_t *)ifcq->ifcq_disc;
2625 	grp = fq_if_find_grp(fqs, grp_idx);
2626 
2627 	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
2628 		return;
2629 	}
2630 
2631 	pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
2632 	TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2633 }
2634