xref: /xnu-12377.41.6/bsd/net/pktsched/pktsched_fq_codel.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <kern/zalloc.h>
32 #include <net/ethernet.h>
33 #include <net/if_var.h>
34 #include <net/if.h>
35 #include <net/droptap.h>
36 #include <net/classq/classq.h>
37 #include <net/classq/classq_fq_codel.h>
38 #include <net/pktsched/pktsched_ops.h>
39 #include <net/pktsched/pktsched_fq_codel.h>
40 #include <os/log.h>
41 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
42 #include <mach/thread_act.h>
43 #include <kern/thread.h>
44 #include <kern/sched_prim.h>
45 
46 #include <skywalk/core/skywalk_var.h>
47 
48 #define FQ_CODEL_DEFAULT_QUANTUM 1500
49 
50 #define FQ_CODEL_QUANTUM_BK_SYS(_q)    (_q)
51 #define FQ_CODEL_QUANTUM_BK(_q)        (_q)
52 #define FQ_CODEL_QUANTUM_BE(_q)        (_q)
53 #define FQ_CODEL_QUANTUM_RD(_q)        (_q)
54 #define FQ_CODEL_QUANTUM_OAM(_q)       (_q)
55 #define FQ_CODEL_QUANTUM_AV(_q)        (_q * 2)
56 #define FQ_CODEL_QUANTUM_RV(_q)        (_q * 2)
57 #define FQ_CODEL_QUANTUM_VI(_q)        (_q * 2)
58 #define FQ_CODEL_QUANTUM_VO(_q)        ((_q * 2) / 5)
59 #define FQ_CODEL_QUANTUM_CTL(_q)       ((_q * 2) / 5)
60 
61 #define IFQ_DEF_C_TARGET_DELAY        (10ULL * 1000 * 1000)   /* 10 ms */
62 #define IFQ_DEF_C_UPDATE_INTERVAL     (100ULL * 1000 * 1000)  /* 100 ms */
63 #define IFQ_DEF_L4S_TARGET_DELAY        (2ULL * 1000 * 1000)   /* 2 ms */
64 #define IFQ_DEF_L4S_WIRELESS_TARGET_DELAY   (15ULL * 1000 * 1000)   /* 15 ms */
65 #define IFQ_DEF_L4S_UPDATE_INTERVAL     (100ULL * 1000 * 1000)  /* 100 ms */
66 #define IFQ_LL_C_TARGET_DELAY     (10ULL * 1000 * 1000)   /* 10 ms */
67 #define IFQ_LL_C_UPDATE_INTERVAL  (100ULL * 1000 * 1000)  /* 100 ms */
68 #define IFQ_LL_L4S_TARGET_DELAY     (2ULL * 1000 * 1000)   /* 2 ms */
69 #define IFQ_LL_L4S_WIRELESS_TARGET_DELAY   (15ULL * 1000 * 1000)   /* 15 ms */
70 #define IFQ_LL_L4S_UPDATE_INTERVAL  (100ULL * 1000 * 1000)  /* 100 ms */
71 
72 static uint64_t fq_if_def_c_target_qdelay = 0;
73 static uint64_t fq_if_def_c_update_interval = 0;
74 static uint64_t fq_if_def_l4s_target_qdelay = 0;
75 static uint64_t fq_if_def_l4s_update_interval = 0;
76 static uint64_t fq_if_ll_c_target_qdelay = 0;
77 static uint64_t fq_if_ll_c_update_interval = 0;
78 static uint64_t fq_if_ll_l4s_target_qdelay = 0;
79 static uint64_t fq_if_ll_l4s_update_interval = 0;
80 
81 uint32_t fq_codel_quantum = 0;
82 
83 static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
84 static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);
85 
86 SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
87     0, "FQ-CODEL parameters");
88 
89 static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
90 #if (DEVELOPMENT || DEBUG)
91 SYSCTL_EXTENSIBLE_NODE(_net_classq_fq_codel, OID_AUTO, params,
92     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "classq fq codel parameters");
93 
94 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
95     CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
96 
97 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED,
98     &fq_if_def_c_target_qdelay, "classic target queue delay in nanoseconds");
99 
100 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, update_interval,
101     CTLFLAG_RW | CTLFLAG_LOCKED, &fq_if_def_c_update_interval,
102     "classic update interval in nanoseconds");
#endif /* (DEVELOPMENT || DEBUG) */
104 
105 unsigned int fq_codel_enable_pacing = 1;
106 SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED,
107     &fq_codel_enable_pacing, 0, "Enable pacing");
108 
109 uint32_t fq_codel_enable_l4s = 1;
110 SYSCTL_UINT(_net_classq_fq_codel, OID_AUTO, enable_l4s,
111     CTLFLAG_RW | CTLFLAG_LOCKED, &fq_codel_enable_l4s, 0,
112     "enable/disable L4S");
113 
114 uint32_t fq_codel_enable_ecn = 0;
115 SYSCTL_UINT(_net_classq_fq_codel, OID_AUTO, enable_ecn,
116     CTLFLAG_RW | CTLFLAG_LOCKED, &fq_codel_enable_ecn, 0,
117     "enable/disable ECN for classic traffic");
118 
119 typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;
120 
121 static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
122 static void fq_if_destroy(fq_if_t *fqs);
123 static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
124     uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
125 static void fq_if_dequeue_class(fq_if_t *, fq_if_classq_t *, uint32_t,
126     int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
127     uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*);
128 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
129 static void fq_if_purge(fq_if_t *);
130 static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
131 static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
132     uint64_t);
133 static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
134 static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
135     fq_t *fq, uint64_t now);
136 static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
137 static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
138     bool purge_all);
139 static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
140 static int fq_if_dequeue_sc_separate(struct ifclassq *ifq,
141     mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
142     classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
143     u_int32_t *retbytecnt, uint8_t grp_idx);
144 static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
145     cqrq_stat_sc_t *stat, uint64_t now);
146 static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
147 static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
148 static void fq_if_destroy_grps(fq_if_t *fqs);
149 static void fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx);
150 static void fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx);
151 static void fq_if_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay,
152     uint32_t flags);
153 static void fq_if_calc_update_interval(uint64_t *update_interval, uint32_t flags);
154 
155 uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
156 	[FQ_IF_CTL_INDEX]       = 8,
157 	[FQ_IF_VO_INDEX]        = 8,
158 	[FQ_IF_VI_INDEX]        = 6,
159 	[FQ_IF_RV_INDEX]        = 6,
160 	[FQ_IF_AV_INDEX]        = 6,
161 	[FQ_IF_OAM_INDEX]       = 4,
162 	[FQ_IF_RD_INDEX]        = 4,
163 	[FQ_IF_BE_INDEX]        = 4,
164 	[FQ_IF_BK_INDEX]        = 2,
165 	[FQ_IF_BK_SYS_INDEX]    = 2,
166 };
167 
168 #define FQ_CODEL_DRR_MAX(_s)    fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]
169 
170 static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
171     fq_if_state state);
172 static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
173     fq_if_state dst_state, fq_if_state src_state);
174 static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
175     fq_if_state state);
176 static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
177     fq_if_state state, fq_if_group_t **selected_grp);
178 static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
179     fq_if_state dst_state, fq_if_state src_state);
180 
181 static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
182     fq_if_state state);
183 static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
184     fq_if_state dst_state, fq_if_state src_state);
185 static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
186     fq_if_state state);
187 static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
188     fq_if_state state, fq_if_group_t **selected_grp);
189 static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
190     fq_if_state dst_state, fq_if_state src_state);
191 
192 void fq_if_teardown(struct ifclassq *ifq);
193 int fq_if_request(struct ifclassq *ifq, cqrq_t rq, void *arg);
194 int fq_if_getqstats(struct ifclassq *ifq, uint8_t gid,
195     u_int32_t qid, struct if_ifclassq_stats *ifqs);
196 int fq_if_enqueue(struct ifclassq *ifq, classq_pkt_t *h,
197     classq_pkt_t *t, uint32_t cnt, uint32_t bytes, boolean_t *pdrop);
198 int fq_if_dequeue(struct ifclassq *ifq, u_int32_t maxpktcnt,
199     u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet,
200     u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx);
201 int fq_if_dequeue_sc(struct ifclassq *ifq,
202     mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
203     classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
204     u_int32_t *retbytecnt, uint8_t grp_idx);
205 int fq_if_setup_legacy(struct ifclassq *ifq, u_int32_t flags,
206     classq_pkt_type_t ptype);
207 boolean_t fq_if_allow_dequeue(struct ifclassq *ifq);
208 
209 int fq_if_setup_new(struct ifclassq *ifq, u_int32_t flags,
210     classq_pkt_type_t ptype);
211 
212 bitmap_ops_t fq_if_grps_bitmap_ops =
213 {
214 	.ffs    = fq_if_grps_bitmap_ffs,
215 	.zeros  = fq_if_grps_bitmap_zeros,
216 	.cpy    = fq_if_grps_bitmap_cpy,
217 	.clr    = fq_if_grps_bitmap_clr,
218 	.move   = fq_if_grps_bitmap_move,
219 };
220 
221 bitmap_ops_t fq_if_grps_sc_bitmap_ops =
222 {
223 	.ffs    = fq_if_grps_sc_bitmap_ffs,
224 	.zeros  = fq_if_grps_sc_bitmap_zeros,
225 	.cpy    = fq_if_grps_sc_bitmap_cpy,
226 	.clr    = fq_if_grps_sc_bitmap_clr,
227 	.move   = fq_if_grps_sc_bitmap_move,
228 };
229 
230 static uint32_t fq_if_hash_table_size;
231 
232 struct pktsched_ops fq_codel_classq_ops = {
233 	.ps_id             = PKTSCHEDT_FQ_CODEL,
234 	.ps_setup          = fq_if_setup_legacy,
235 	.ps_teardown       = fq_if_teardown,
236 	.ps_enq            = fq_if_enqueue,
237 	.ps_deq            = fq_if_dequeue,
238 	.ps_deq_sc         = fq_if_dequeue_sc,
239 	.ps_req            = fq_if_request,
240 	.ps_stats          = fq_if_getqstats,
241 	.ps_allow_dequeue  = fq_if_allow_dequeue,
242 };
243 
244 struct pktsched_ops new_fq_codel_classq_ops = {
245 	.ps_id             = PKTSCHEDT_FQ_CODEL_NEW,
246 	.ps_setup          = fq_if_setup_new,
247 	.ps_teardown       = fq_if_teardown,
248 	.ps_enq            = fq_if_enqueue,
249 	.ps_deq            = fq_if_dequeue,
250 	.ps_deq_sc         = fq_if_dequeue_sc,
251 	.ps_req            = fq_if_request,
252 	.ps_stats          = fq_if_getqstats,
253 	.ps_allow_dequeue  = fq_if_allow_dequeue,
254 };
255 
256 void
pktsched_fq_init(void)257 pktsched_fq_init(void)
258 {
259 	pktsched_ops_register(&fq_codel_classq_ops);
260 	pktsched_ops_register(&new_fq_codel_classq_ops);
261 
262 	if (serverperfmode) {
263 		fq_if_hash_table_size = (1 << 16);
264 	} else {
265 		fq_if_hash_table_size = (1 << 8);
266 	}
267 
268 	// format looks like ifcq_drr_max=8,8,6
269 	char buf[(FQ_IF_MAX_CLASSES) * 3];
270 	size_t i, len, pri_index = 0;
271 	uint32_t drr = 0;
272 	if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
273 		return;
274 	}
275 
276 	len = strbuflen(buf, sizeof(buf));
277 	for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
278 		if (buf[i] != ',' && buf[i] != '\0') {
279 			VERIFY(buf[i] >= '0' && buf[i] <= '9');
280 			drr = drr * 10 + buf[i] - '0';
281 			continue;
282 		}
283 		fq_codel_drr_max_values[pri_index] = drr;
284 		pri_index += 1;
285 		drr = 0;
286 	}
287 
288 #if DEBUG || DEVELOPMENT
289 	PE_parse_boot_argn("fq_codel_quantum", &fq_codel_quantum,
290 	    sizeof(fq_codel_quantum));
291 	PE_parse_boot_argn("fq_if_def_c_target_qdelay", &fq_if_def_c_target_qdelay,
292 	    sizeof(fq_if_def_c_target_qdelay));
293 	PE_parse_boot_argn("fq_if_def_c_update_interval",
294 	    &fq_if_def_c_update_interval, sizeof(fq_if_def_c_update_interval));
295 	PE_parse_boot_argn("fq_if_def_l4s_target_qdelay", &fq_if_def_l4s_target_qdelay,
296 	    sizeof(fq_if_def_l4s_target_qdelay));
297 	PE_parse_boot_argn("fq_if_def_l4s_update_interval",
298 	    &fq_if_def_l4s_update_interval, sizeof(fq_if_def_l4s_update_interval));
299 	PE_parse_boot_argn("fq_if_ll_c_target_qdelay", &fq_if_ll_c_target_qdelay,
300 	    sizeof(fq_if_ll_c_target_qdelay));
301 	PE_parse_boot_argn("fq_if_ll_c_update_interval",
302 	    &fq_if_ll_c_update_interval, sizeof(fq_if_ll_c_update_interval));
303 	PE_parse_boot_argn("fq_if_ll_l4s_target_qdelay", &fq_if_ll_l4s_target_qdelay,
304 	    sizeof(fq_if_ll_l4s_target_qdelay));
305 	PE_parse_boot_argn("fq_if_ll_l4s_update_interval",
306 	    &fq_if_ll_l4s_update_interval, sizeof(fq_if_ll_l4s_update_interval));
307 #endif /* DEBUG || DEVELOPMENT */
308 
309 	PE_parse_boot_argn("fq_codel_enable_pacing", &fq_codel_enable_pacing,
310 	    sizeof(fq_codel_enable_pacing));
311 
312 	fq_codel_init();
313 }
314 
315 static uint32_t
fq_if_flow_hash_id(uint32_t flowid)316 fq_if_flow_hash_id(uint32_t flowid)
317 {
318 	return flowid & (fq_if_hash_table_size - 1);
319 }
320 
321 #define FQ_IF_CLASSQ_IDLE(_fcl_) \
322 	(STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
323 	STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))
324 
325 typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
326 typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
327     int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
328     uint32_t *, boolean_t *, uint64_t);
329 
330 static void
fq_if_append_mbuf(classq_pkt_t * pkt,classq_pkt_t * next_pkt)331 fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
332 {
333 	pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
334 }
335 
336 static inline uint64_t
fq_codel_get_time(void)337 fq_codel_get_time(void)
338 {
339 	struct timespec ts;
340 	uint64_t now;
341 
342 	nanouptime(&ts);
343 	now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
344 	return now;
345 }
346 
347 #if SKYWALK
/* Chain next_pkt behind pkt on the skywalk kernel-packet list. */
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
353 #endif /* SKYWALK */
354 
355 #if SKYWALK
/*
 * Dequeue skywalk packets from one flow queue and link them onto the
 * caller's head/tail chain.  Dequeueing stops when the flow's DRR
 * deficit is exhausted, the queue empties, the flow's next tx time has
 * not yet been reached (see fq_tx_time_ready), or the caller's
 * packet/byte limits are hit.  pkt_cnt/byte_cnt are incremented by what
 * was taken; *qempty reports whether the flow queue ended up empty.
 * Returns TRUE iff a caller-supplied limit was reached.
 */
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	uint32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fqs->fqs_dequeue(fqs, fq, &pkt, now);
		if (pkt.pktsched_pcnt == 0) {
			/* nothing usable came back; try the queue again */
			continue;
		}

		ASSERT(pkt.pktsched_ptype == QP_PACKET);

		/* charge the packet against this flow's DRR deficit */
		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		/* first packet out of a brand-new flow carries PKT_F_NEW_FLOW */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* append the packet to the caller's chain */
		if (head->cp_kpkt == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_kpkt != NULL);
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_kpkt->pkt_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
	return limit_reached;
}
417 #endif /* SKYWALK */
418 
/*
 * mbuf twin of fq_getq_flow_kpkt(): dequeue mbufs from one flow queue
 * and link them onto the caller's head/tail chain.  Stops on exhausted
 * DRR deficit, empty queue, flow tx time not ready, or when the
 * caller's packet/byte limits are hit.  pkt_cnt/byte_cnt accumulate
 * what was taken; *qempty reports whether the flow queue emptied.
 * Returns TRUE iff a caller-supplied limit was reached.
 */
static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	u_int32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fqs->fqs_dequeue(fqs, fq, &pkt, now);
		if (pkt.pktsched_pcnt == 0) {
			/* nothing usable came back; try the queue again */
			continue;
		}

		ASSERT(pkt.pktsched_ptype == QP_MBUF);

		/* charge the packet against this flow's DRR deficit */
		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;

		/* first packet out of a brand-new flow carries PKTF_NEW_FLOW */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* append the packet to the caller's chain */
		if (head->cp_mbuf == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_mbuf != NULL);
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_mbuf->m_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
	return limit_reached;
}
477 
478 static void
fq_if_pacemaker_tcall(thread_call_param_t arg0,thread_call_param_t arg1)479 fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
480 {
481 #pragma unused(arg1)
482 	struct ifnet* ifp = (struct ifnet*)arg0;
483 	ASSERT(ifp != NULL);
484 
485 	ifnet_start_ignore_delay(ifp);
486 }
487 
488 static void
fq_if_calc_target_qdelay(struct ifnet * ifp,uint64_t * if_target_qdelay,uint32_t flags)489 fq_if_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay,
490     uint32_t flags)
491 {
492 	uint64_t qdelay = 0, qdelay_configed = 0, qdely_default = 0;
493 	if (flags == IF_CLASSQ_DEF) {
494 		qdelay = IFCQ_TARGET_QDELAY(ifp->if_snd);
495 	}
496 
497 	switch (flags) {
498 	case IF_CLASSQ_DEF:
499 		qdelay_configed = fq_if_def_c_target_qdelay;
500 		qdely_default = IFQ_DEF_C_TARGET_DELAY;
501 		break;
502 	case IF_CLASSQ_L4S:
503 		qdelay_configed = fq_if_def_l4s_target_qdelay;
504 		if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI ||
505 		    ifp->if_family == IFNET_FAMILY_CELLULAR) {
506 			qdely_default = IFQ_DEF_L4S_WIRELESS_TARGET_DELAY;
507 		} else {
508 			qdely_default = IFQ_DEF_L4S_TARGET_DELAY;
509 		}
510 		break;
511 	case IF_CLASSQ_LOW_LATENCY:
512 		qdelay_configed = fq_if_ll_c_target_qdelay;
513 		qdely_default = IFQ_LL_C_TARGET_DELAY;
514 		break;
515 	case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S):
516 		qdelay_configed = fq_if_ll_l4s_target_qdelay;
517 		if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI ||
518 		    ifp->if_family == IFNET_FAMILY_CELLULAR) {
519 			qdely_default = IFQ_LL_L4S_WIRELESS_TARGET_DELAY;
520 		} else {
521 			qdely_default = IFQ_LL_L4S_TARGET_DELAY;
522 		}
523 		break;
524 	default:
525 		VERIFY(0);
526 		/* NOTREACHED */
527 		__builtin_unreachable();
528 	}
529 
530 	if (qdelay_configed != 0) {
531 		qdelay = qdelay_configed;
532 	}
533 
534 	/*
535 	 * If we do not know the effective bandwidth, use the default
536 	 * target queue delay.
537 	 */
538 	if (qdelay == 0) {
539 		qdelay = qdely_default;
540 	}
541 
542 	/*
543 	 * If a delay has been added to ifnet start callback for
544 	 * coalescing, we have to add that to the pre-set target delay
545 	 * because the packets can be in the queue longer.
546 	 */
547 	if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
548 	    ifp->if_start_delay_timeout > 0) {
549 		qdelay += ifp->if_start_delay_timeout;
550 	}
551 
552 	*(if_target_qdelay) = qdelay;
553 }
554 
555 static void
fq_if_calc_update_interval(uint64_t * update_interval,uint32_t flags)556 fq_if_calc_update_interval(uint64_t *update_interval, uint32_t flags)
557 {
558 	uint64_t interval = 0, interval_configed = 0, interval_default = 0;
559 
560 	switch (flags) {
561 	case IF_CLASSQ_DEF:
562 		interval_configed = fq_if_def_c_update_interval;
563 		interval_default = IFQ_DEF_C_UPDATE_INTERVAL;
564 		break;
565 	case IF_CLASSQ_L4S:
566 		interval_configed = fq_if_def_l4s_update_interval;
567 		interval_default = IFQ_DEF_L4S_UPDATE_INTERVAL;
568 		break;
569 	case IF_CLASSQ_LOW_LATENCY:
570 		interval_configed = fq_if_ll_c_update_interval;
571 		interval_default = IFQ_LL_C_UPDATE_INTERVAL;
572 		break;
573 	case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S):
574 		interval_configed = fq_if_ll_l4s_update_interval;
575 		interval_default = IFQ_LL_L4S_UPDATE_INTERVAL;
576 		break;
577 	default:
578 		VERIFY(0);
579 		/* NOTREACHED */
580 		__builtin_unreachable();
581 	}
582 
583 	/* If the system level override is set, use it */
584 	if (interval_configed != 0) {
585 		interval = interval_configed;
586 	}
587 
588 	/* Otherwise use the default value */
589 	if (interval == 0) {
590 		interval = interval_default;
591 	}
592 
593 	*update_interval = interval;
594 }
595 
/*
 * Allocate and initialize per-interface fq-codel scheduler state.
 * Returns NULL only if the flow hash table allocation fails (the fqs
 * zone allocation uses Z_WAITOK).  Paired with fq_if_destroy().
 */
fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
	flowq_list_t *fqs_flows;
	fq_if_t *fqs;

	ASSERT(ifq->ifcq_ifp != NULL);
	fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
	if (fqs == NULL) {
		return NULL;
	}
	/* flow hash table; size fixed at boot in pktsched_fq_init() */
	fqs_flows = kalloc_type(flowq_list_t, fq_if_hash_table_size, Z_WAITOK | Z_ZERO);
	if (fqs_flows == NULL) {
		zfree(fq_if_zone, fqs);
		return NULL;
	}
	fqs->fqs_flows = fqs_flows;
	fqs->fqs_flows_count = fq_if_hash_table_size;
	fqs->fqs_ifq = ifq;
	fqs->fqs_ptype = ptype;

	/* Configure packet drop limit across all queues */
	fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
	STAILQ_INIT(&fqs->fqs_fclist);
	TAILQ_INIT(&fqs->fqs_empty_list);
	TAILQ_INIT(&fqs->fqs_combined_grp_list);
	/*
	 * One-shot "pacemaker" thread call that restarts the interface so
	 * paced packets go out once their tx time arrives.
	 */
	fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall,
	    (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);

	return fqs;
}
629 
/*
 * Tear down scheduler state created by fq_if_alloc(): cancel and free
 * the pacemaker thread call, purge all queued packets and groups, then
 * free the flow table and the fqs itself.  Caller holds the IFCQ lock;
 * it is converted to exclusive before freeing.
 */
void
fq_if_destroy(fq_if_t *fqs)
{
	struct ifnet    *ifp = fqs->fqs_ifq->ifcq_ifp;
	thread_call_t   __single tcall = fqs->fqs_pacemaker_tcall;

	VERIFY(ifp != NULL);
	ASSERT(tcall != NULL);
	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	/* if_start_lock must not be held: tcall cancel may wait on it */
	LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

	/*
	 * Since we are holding the IFCQ lock here, another thread cannot enter AQM
	 * and schedule a pacemaker call. So we do not need a sleep wait loop here
	 * cancel wait and free should succeed in one call.
	 */
	thread_call_cancel_wait(tcall);
	/*
	 * NOTE(review): thread_call_free() is invoked inside ASSERT(); if
	 * ASSERT compiles to a no-op in this configuration the free would be
	 * skipped — confirm ASSERT retains side effects in release builds.
	 */
	ASSERT(thread_call_free(tcall));

	fq_if_purge(fqs);
	fq_if_destroy_grps(fqs);

	fqs->fqs_ifq = NULL;

#if (DEBUG || DEVELOPMENT)
	struct skoid *fqs_skoid = (struct skoid *)&fqs->fqs_oid;
	skoid_destroy(fqs_skoid);
#endif /* (DEBUG || DEVELOPMENT) */

	kfree_type_counted_by(flowq_list_t, fqs->fqs_flows_count, fqs->fqs_flows);
	zfree(fq_if_zone, fqs);
}
663 
664 static inline uint8_t
fq_if_service_to_priority(fq_if_t * fqs,mbuf_svc_class_t svc)665 fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
666 {
667 	uint8_t pri;
668 
669 	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
670 		switch (svc) {
671 		case MBUF_SC_BK_SYS:
672 		case MBUF_SC_BK:
673 			pri = FQ_IF_BK_INDEX;
674 			break;
675 		case MBUF_SC_BE:
676 		case MBUF_SC_RD:
677 		case MBUF_SC_OAM:
678 			pri = FQ_IF_BE_INDEX;
679 			break;
680 		case MBUF_SC_AV:
681 		case MBUF_SC_RV:
682 		case MBUF_SC_VI:
683 		case MBUF_SC_SIG:
684 			pri = FQ_IF_VI_INDEX;
685 			break;
686 		case MBUF_SC_VO:
687 		case MBUF_SC_CTL:
688 			pri = FQ_IF_VO_INDEX;
689 			break;
690 		default:
691 			pri = FQ_IF_BE_INDEX; /* Use best effort by default */
692 			break;
693 		}
694 		return pri;
695 	}
696 
697 	/* scheduler is not managed by the driver */
698 	switch (svc) {
699 	case MBUF_SC_BK_SYS:
700 		pri = FQ_IF_BK_SYS_INDEX;
701 		break;
702 	case MBUF_SC_BK:
703 		pri = FQ_IF_BK_INDEX;
704 		break;
705 	case MBUF_SC_BE:
706 		pri = FQ_IF_BE_INDEX;
707 		break;
708 	case MBUF_SC_RD:
709 		pri = FQ_IF_RD_INDEX;
710 		break;
711 	case MBUF_SC_OAM:
712 		pri = FQ_IF_OAM_INDEX;
713 		break;
714 	case MBUF_SC_AV:
715 		pri = FQ_IF_AV_INDEX;
716 		break;
717 	case MBUF_SC_RV:
718 		pri = FQ_IF_RV_INDEX;
719 		break;
720 	case MBUF_SC_VI:
721 		pri = FQ_IF_VI_INDEX;
722 		break;
723 	case MBUF_SC_SIG:
724 		pri = FQ_IF_SIG_INDEX;
725 		break;
726 	case MBUF_SC_VO:
727 		pri = FQ_IF_VO_INDEX;
728 		break;
729 	case MBUF_SC_CTL:
730 		pri = FQ_IF_CTL_INDEX;
731 		break;
732 	default:
733 		pri = FQ_IF_BE_INDEX; /* Use best effort by default */
734 		break;
735 	}
736 	return pri;
737 }
738 
739 void
fq_if_classq_init(fq_if_group_t * fqg,uint32_t pri,uint32_t quantum,uint32_t drr_max,uint32_t svc_class)740 fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
741     uint32_t drr_max, uint32_t svc_class)
742 {
743 	fq_if_classq_t *fq_cl;
744 	VERIFY(pri < FQ_IF_MAX_CLASSES);
745 	fq_cl = &fqg->fqg_classq[pri];
746 
747 	VERIFY(fq_cl->fcl_quantum == 0);
748 	VERIFY(quantum != 0);
749 	fq_cl->fcl_quantum = quantum;
750 	fq_cl->fcl_pri = pri;
751 	fq_cl->fcl_drr_max = drr_max;
752 	fq_cl->fcl_service_class = svc_class;
753 	fq_cl->fcl_next_tx_time = 0;
754 	fq_cl->fcl_flags = 0;
755 	STAILQ_INIT(&fq_cl->fcl_new_flows);
756 	STAILQ_INIT(&fq_cl->fcl_old_flows);
757 }
758 
/*
 * Enqueue a chain of packets (cnt packets, bytes total, head..tail) on
 * the fq-codel scheduler attached to ifq.  On return *pdrop says
 * whether the chain was dropped.  Returns 0 on success or a classq
 * errno-style code: EQFULL (flow-control advisory), EQCONGESTED,
 * EQSUSPENDED (throttled/suspended), or ENOBUFS (dropped).
 */
int
fq_if_enqueue(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
	uint8_t pri, grp_idx = 0;
	fq_if_t *fqs;
	fq_if_classq_t *fq_cl;
	fq_if_group_t *fq_group;
	int ret;
	mbuf_svc_class_t svc;
	pktsched_pkt_t pkt;

	pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

	IFCQ_LOCK_SPIN(ifq);
	fqs = (fq_if_t *)ifq->ifcq_disc;
	svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
	/* skywalk packets carry their target group (queue set) index */
	if (head->cp_ptype == QP_PACKET) {
		grp_idx = head->cp_kpkt->pkt_qset_idx;
	}
#endif /* SKYWALK */
	pri = fq_if_service_to_priority(fqs, svc);
	VERIFY(pri < FQ_IF_MAX_CLASSES);

	fq_group = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_group->fqg_classq[pri];

	/* drop background-system traffic while it is being throttled */
	if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
		IFCQ_UNLOCK(ifq);
		/* BK_SYS is currently throttled */
		os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed);
		if (__improbable(droptap_verbose > 0)) {
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
		*pdrop = TRUE;
		ret = EQSUSPENDED;
		goto done;
	}

	ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
	ret = fqs->fqs_enqueue(fqs, fq_group, &pkt, fq_cl);
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
		    (1 << pri)) == 0) {
			/*
			 * this group is not in ER or EB groups,
			 * mark it as IB
			 */
			pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
		}
	}

	/* map the classq enqueue result onto the caller-visible code */
	if (__improbable(ret != 0)) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_COMPRESSED) {
			ret = 0;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_CONGESTED) {
			ret = EQCONGESTED;
			*pdrop = FALSE;
		} else {
			/* hard failure: chain was not enqueued, drop it */
			IFCQ_UNLOCK(ifq);
			*pdrop = TRUE;
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_FULL,
			    __func__, __LINE__, 0);
			switch (ret) {
			case CLASSQEQ_DROP:
				ret = ENOBUFS;
				goto done;
			case CLASSQEQ_DROP_FC:
				ret = EQFULL;
				goto done;
			case CLASSQEQ_DROP_SP:
				ret = EQSUSPENDED;
				goto done;
			default:
				VERIFY(0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* NOTREACHED */
			__builtin_unreachable();
		}
	} else {
		*pdrop = FALSE;
	}
	/* account the enqueued chain on the ifclassq and its group */
	IFCQ_ADD_LEN(ifq, cnt);
	IFCQ_INC_BYTES(ifq, bytes);


	FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
	FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

	IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
	/* allow tests to suppress flow-control advisories */
	if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
		ret = 0;
	}
#endif /* DEBUG || DEVELOPMENT */
	return ret;
}
868 
869 static inline void
fq_dqlist_add(flowq_dqlist_t * fq_dqlist_head,fq_t * fq)870 fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
871 {
872 	ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
873 	ASSERT(!fq->fq_in_dqlist);
874 	STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
875 	fq->fq_in_dqlist = true;
876 }
877 
/*
 * Unlink fq from the dequeue list and splice its staged packet chain
 * (fq_dq_head..fq_dq_tail) onto the caller's head/tail chain.  If
 * nothing was staged, only the unlink and state reset happen.
 */
static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	ASSERT(fq->fq_in_dqlist);
	/*
	 * cp_mbuf serves as the "chain is empty" test for both packet
	 * types here — presumably classq_pkt_t overlays cp_mbuf/cp_kpkt
	 * in a union; confirm against classq.h.
	 */
	if (fq->fq_dq_head.cp_mbuf == NULL) {
		goto done;
	}

	if (head->cp_mbuf == NULL) {
		/* caller's chain is empty: staged chain becomes the chain */
		*head = fq->fq_dq_head;
	} else {
		ASSERT(tail->cp_mbuf != NULL);

		/* link the staged chain behind the caller's current tail */
		switch (ptype) {
		case QP_MBUF:
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
			ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
			break;
#if SKYWALK
		case QP_PACKET:
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
			ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
			break;
#endif /* SKYWALK */
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}
	*tail = fq->fq_dq_tail;
done:
	STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;
}
918 
919 static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t * fq_dqlist_head,classq_pkt_t * head,classq_pkt_t * tail,classq_pkt_type_t ptype)920 fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
921     classq_pkt_t *tail, classq_pkt_type_t ptype)
922 {
923 	fq_t *fq, *tfq;
924 
925 	STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
926 		fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
927 	}
928 }
929 
930 static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)931 fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
932     fq_if_group_t **selected_grp)
933 {
934 	#pragma unused(pri)
935 
936 	fq_if_group_t *grp;
937 	uint32_t highest_pri = FQ_IF_MAX_CLASSES;
938 	int ret_pri = 0;
939 
940 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
941 		uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
942 		/* bitmap is empty in this case */
943 		if (cur_pri == 0) {
944 			continue;
945 		}
946 		if (cur_pri <= highest_pri) {
947 			highest_pri = cur_pri;
948 			ret_pri = cur_pri;
949 			*selected_grp = grp;
950 		}
951 	}
952 	return ret_pri;
953 }
954 
955 static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)956 fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
957 {
958     #pragma unused(pri)
959 
960 	fq_if_group_t *grp;
961 
962 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
963 		if (grp->fqg_bitmaps[state] != 0) {
964 			return FALSE;
965 		}
966 	}
967 	return TRUE;
968 }
969 
970 static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)971 fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
972     fq_if_state src_state)
973 {
974     #pragma unused(pri)
975 
976 	fq_if_group_t *grp;
977 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
978 		grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
979 	}
980 }
981 
982 static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)983 fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
984 {
985     #pragma unused(pri)
986 
987 	fq_if_group_t *grp;
988 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
989 		grp->fqg_bitmaps[state] = 0;
990 	}
991 }
992 
993 static void
fq_if_grps_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)994 fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
995     fq_if_state src_state)
996 {
997     #pragma unused(pri)
998 
999 	fq_if_group_t *grp;
1000 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1001 		grp->fqg_bitmaps[dst_state] =
1002 		    grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state];
1003 		grp->fqg_bitmaps[src_state] = 0;
1004 	}
1005 }
1006 
1007 static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)1008 fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
1009     fq_if_group_t **selected_grp)
1010 {
1011 	fq_if_group_t *grp;
1012 	int ret_pri = 0;
1013 
1014 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1015 		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
1016 			/* +1 to match the semantics of pktsched_ffs */
1017 			ret_pri = pri + 1;
1018 			*selected_grp = grp;
1019 			break;
1020 		}
1021 	}
1022 
1023 	return ret_pri;
1024 }
1025 
1026 static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)1027 fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
1028 {
1029 	fq_if_group_t *grp;
1030 
1031 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1032 		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
1033 			return FALSE;
1034 		}
1035 	}
1036 	return TRUE;
1037 }
1038 
1039 static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)1040 fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
1041     fq_if_state src_state)
1042 {
1043 	fq_if_group_t *grp;
1044 
1045 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1046 		pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
1047 		    &grp->fqg_bitmaps[src_state]);
1048 	}
1049 }
1050 
1051 static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)1052 fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
1053 {
1054 	fq_if_group_t *grp;
1055 
1056 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1057 		pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
1058 	}
1059 }
1060 
1061 static void
fq_if_grps_sc_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)1062 fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
1063     fq_if_state src_state)
1064 {
1065 	fq_if_group_t *grp;
1066 
1067 	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1068 		pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state],
1069 		    &grp->fqg_bitmaps[src_state]);
1070 		pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]);
1071 	}
1072 }
1073 
1074 /*
1075  * Pacemaker is only scheduled when no packet can be dequeued from AQM
1076  * due to pacing. Pacemaker will doorbell the driver when current >= next_tx_time.
1077  * This only applies to L4S traffic at this moment.
1078  */
1079 static void
fq_if_schedule_pacemaker(fq_if_t * fqs,uint64_t now,uint64_t next_tx_time)1080 fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
1081 {
1082 	uint64_t deadline = 0;
1083 	if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) {
1084 		return;
1085 	}
1086 	ASSERT(next_tx_time != FQ_INVALID_TX_TS);
1087 	ASSERT(fqs->fqs_pacemaker_tcall != NULL);
1088 	ASSERT(now < next_tx_time);
1089 
1090 	DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, fqs->fqs_ifq->ifcq_ifp,
1091 	    uint64_t, next_tx_time - now);
1092 	KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index, now,
1093 	    next_tx_time, next_tx_time - now);
1094 
1095 	clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1, &deadline);
1096 	thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
1097 }
1098 
/*
 * Common dequeue path: runs the inter-class DRR scheduler over either the
 * combined group list or a single group (grp_idx), harvesting up to
 * maxpktcnt packets / maxbytecnt bytes into a single packet chain.
 *
 * Class scheduling states (per-group bitmaps):
 *   FQ_IF_ER - eligible, in a round;  FQ_IF_EB - eligible, blocked
 *   FQ_IF_IB - ineligible (budget exhausted);  FQ_IF_IR - paced (L4S)
 *
 * Returns 0; results are reported through first/last_packet and
 * retpktcnt/retbytecnt (any of which may be NULL).
 */
static int
fq_if_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	uint32_t total_pktcnt = 0, total_bytecnt = 0;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_classq_t *fq_cl;
	fq_grp_tailq_t *grp_list, tmp_grp_list;
	fq_if_group_t *__single fq_grp = NULL;
	fq_if_t *fqs;
	uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
	int pri = 0, svc_pri = 0;
	bool all_paced = true;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	STAILQ_INIT(&fq_dqlist_head);

	/* pick the chain-append helper matching the scheduler's packet type */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	now = fq_codel_get_time();
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		/* driver-managed: only the requested service class is served */
		svc_pri = fq_if_service_to_priority(fqs, svc);
	} else {
		VERIFY(svc == MBUF_SC_UNSPEC);
	}

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		grp_list = &fqs->fqs_combined_grp_list;
		VERIFY(!TAILQ_EMPTY(grp_list));
	} else {
		/* stand-alone group: run the scheduler over a one-entry list */
		grp_list = &tmp_grp_list;
		fq_grp = fq_if_find_grp(fqs, grp_idx);
		TAILQ_INIT(grp_list);
		TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
	}

	for (;;) {
		uint32_t pktcnt = 0, bytecnt = 0;
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		bool fq_cl_all_paced = false;
		uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;

		/*
		 * No class is eligible (ER) or blocked-eligible (EB):
		 * start a new DRR round by promoting IB to EB.  If that
		 * also comes up empty, we are done.
		 */
		if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
		    fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
			fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
			fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
			if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
				if (fq_codel_enable_pacing && fq_codel_enable_l4s) {
					/*
					 * Move fq_cl in IR back to ER, so that they will be
					 * inspected with priority the next time the driver
					 * dequeues
					 */
					fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
					fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IR);
				}
				break;
			}
		}
		pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
		if (pri == 0) {
			/*
			 * There are no ER flows, move the highest
			 * priority one from EB if there are any in that
			 * category
			 */
			pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
			VERIFY(pri > 0);
			VERIFY(fq_grp != NULL);
			pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
			pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		}
		VERIFY(fq_grp != NULL);
		pri--; /* index starts at 0 */
		fq_cl = &fq_grp->fqg_classq[pri];

		if (fq_cl->fcl_budget <= 0) {
			/* Update the budget: one quantum per active flow, capped at drr_max */
			fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
			    fq_cl->fcl_stat.fcl_flows_cnt) *
			    fq_cl->fcl_quantum);
			if (fq_cl->fcl_budget <= 0) {
				goto state_change;
			}
		}
		/* pull up to the remaining pkt/byte allowance from this class */
		fq_if_dequeue_class(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
		    &fq_cl_next_tx_time);
		if (head.cp_mbuf != NULL) {
			ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
			/* terminate the chain (tmp is an empty packet) */
			append_pkt(&last, &tmp);
		}
		/* track the earliest pacing deadline across fully-paced classes */
		if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			next_tx_time = fq_cl_next_tx_time;
		}
		fq_cl->fcl_budget -= bytecnt;
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		/*
		 * If the class has exceeded the budget but still has data
		 * to send, move it to IB
		 */
state_change:
		VERIFY(fq_grp != NULL);
		all_paced &= fq_cl_all_paced;
		if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
			if (fq_cl->fcl_budget <= 0) {
				pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
				pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			} else if (fq_cl_all_paced) {
				if (fq_codel_enable_pacing && fq_codel_enable_l4s) {
					/*
					 * If a fq_cl still has budget but only paced queues, park it
					 * in IR so that we will not keep looping over it
					 */
					pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IR]);
					pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
				}
			}
		} else {
			/* class drained: clear it from every scheduling state */
			pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
			    fq_grp->fqg_bitmaps[FQ_IF_EB] |
			    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
			fq_cl->fcl_budget = 0;
		}
		if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
			if (fq_codel_enable_pacing && fq_codel_enable_l4s) {
				/*
				 * Move fq_cl in IR back to ER, so that they will be
				 * inspected with priority the next time the driver dequeues
				 */
				fqs->grp_bitmaps_move(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
			}
			break;
		}
	}

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		/* tear down the temporary one-entry group list */
		TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
		VERIFY(TAILQ_EMPTY(grp_list));
	}

	/* collect chains left on per-flow dequeue lists into first/last */
	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
	    fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}
	if (next_tx_time != FQ_INVALID_TX_TS) {
		/* something is pending but paced: arm the pacemaker */
		ASSERT(next_tx_time > now);
		fq_if_schedule_pacemaker(fqs, now, next_tx_time);
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}
1299 
1300 int
fq_if_dequeue(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1301 fq_if_dequeue(struct ifclassq *ifq, u_int32_t maxpktcnt,
1302     u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1303     classq_pkt_t *last_packet, u_int32_t *retpktcnt,
1304     u_int32_t *retbytecnt, uint8_t grp_idx)
1305 {
1306 	return fq_if_dequeue_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
1307 	           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1308 }
1309 
1310 int
fq_if_dequeue_sc(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1311 fq_if_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t svc,
1312     u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1313     classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1314     uint8_t grp_idx)
1315 {
1316 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1317 
1318 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
1319 		return fq_if_dequeue_common(ifq, svc, maxpktcnt, maxbytecnt,
1320 		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1321 	} else {
1322 		/*
1323 		 * take a shortcut here since there is no need to schedule
1324 		 * one single service class.
1325 		 */
1326 		return fq_if_dequeue_sc_separate(ifq, svc, maxpktcnt, maxbytecnt,
1327 		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1328 	}
1329 }
1330 
/*
 * Fast-path dequeue for a single service class of a stand-alone group:
 * no inter-class DRR scheduling, just drain the one class up to the
 * pkt/byte allowance.  Results are returned through first/last_packet
 * and retpktcnt/retbytecnt (any of which may be NULL).
 */
static int
fq_if_dequeue_sc_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
	uint8_t pri;
	u_int32_t total_pktcnt = 0, total_bytecnt = 0;
	fq_if_classq_t *fq_cl;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_group_t *fq_grp;
	uint64_t now;

	/* pick the chain-append helper matching the scheduler's packet type */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	STAILQ_INIT(&fq_dqlist_head);
	now = fq_codel_get_time();

	/* locate the one class queue this call will drain */
	pri = fq_if_service_to_priority(fqs, svc);
	fq_grp = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_grp->fqg_classq[pri];

	/*
	 * Now we have the queue for a particular service class. We need
	 * to dequeue as many packets as needed, first from the new flows
	 * and then from the old flows.
	 */
	while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
	    fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		u_int32_t pktcnt = 0, bytecnt = 0;
		bool all_paced = false;
		uint64_t next_tx_time = FQ_INVALID_TX_TS;

		fq_if_dequeue_class(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time);
		if (head.cp_mbuf != NULL) {
			/* splice this batch onto the running chain */
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
		}
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		if (next_tx_time != FQ_INVALID_TX_TS) {
			/* remaining traffic is paced: arm the pacemaker and stop */
			ASSERT(next_tx_time > now);
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			fq_if_schedule_pacemaker(fqs, now, next_tx_time);
			break;
		}
	}

	/*
	 * Mark classq as IB if it's not idle, so that we can
	 * start without re-init the bitmaps when it's switched
	 * to combined mode.
	 */
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
	} else {
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
		    fq_grp->fqg_bitmaps[FQ_IF_EB] |
		    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
	}

	/* collect chains left on per-flow dequeue lists into first/last */
	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}
1443 
/*
 * Drop every packet queued on flow 'fq', account the drops, and walk the
 * flow through its lifecycle states (new -> old -> empty -> freed).  On
 * return, 'fq' may have been freed (when it was on the empty list).
 * pktsp/bytesp (optional) receive the number of packets/bytes dropped.
 */
static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
	fq_if_classq_t *fq_cl;
	u_int32_t pkts, bytes;
	pktsched_pkt_t pkt;
	fq_if_group_t *grp;

	fq_cl = &FQ_CLASSQ(fq);
	grp = FQ_GROUP(fq);
	pkts = bytes = 0;
	_PKTSCHED_PKT_INIT(&pkt);
	/* drain the flow, freeing (or droptap-logging) each packet */
	for (;;) {
		fqs->fqs_dequeue(fqs, fq, &pkt, now);
		if (pkt.pktsched_pkt_mbuf == NULL) {
			VERIFY(pkt.pktsched_ptype == QP_INVALID);
			break;
		}
		pkts++;
		bytes += pktsched_get_pkt_len(&pkt);
		if (__improbable(droptap_verbose > 0)) {
			/* verbose drop capture: route the drop through droptap */
			pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_PURGE_FLOW,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

	IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

	/* move through the flow queue states */
	VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
	if (fq->fq_flags & FQF_NEW_FLOW) {
		fq_if_empty_new_flow(fq, fq_cl);
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		fq_if_empty_old_flow(fqs, fq_cl, fq, now);
	}
	if (fq->fq_flags & FQF_EMPTY_FLOW) {
		/* the flow is reclaimed here; do not touch it afterwards */
		fq_if_purge_empty_flow(fqs, fq);
		fq = NULL;
	}

	/* a fully idle class must not appear in any scheduling bitmap */
	if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
		int i;
		for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
			pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
		}
	}

	if (pktsp != NULL) {
		*pktsp = pkts;
	}
	if (bytesp != NULL) {
		*bytesp = bytes;
	}
}
1504 
1505 static void
fq_if_purge_classq(fq_if_t * fqs,fq_if_classq_t * fq_cl)1506 fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1507 {
1508 	fq_t *fq, *tfq;
1509 	uint64_t now;
1510 
1511 	now = fq_codel_get_time();
1512 	/*
1513 	 * Take each flow from new/old flow list and flush mbufs
1514 	 * in that flow
1515 	 */
1516 	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
1517 		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
1518 	}
1519 	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
1520 		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
1521 	}
1522 	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
1523 	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));
1524 
1525 	STAILQ_INIT(&fq_cl->fcl_new_flows);
1526 	STAILQ_INIT(&fq_cl->fcl_old_flows);
1527 	fq_cl->fcl_budget = 0;
1528 }
1529 
1530 static void
fq_if_purge(fq_if_t * fqs)1531 fq_if_purge(fq_if_t *fqs)
1532 {
1533 	uint64_t now;
1534 	fq_if_group_t *grp;
1535 	int i;
1536 
1537 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1538 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1539 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1540 			continue;
1541 		}
1542 
1543 		grp = fq_if_find_grp(fqs, grp_idx);
1544 		fq_if_purge_grp(fqs, grp);
1545 	}
1546 
1547 	now = fq_codel_get_time();
1548 	fq_if_purge_empty_flow_list(fqs, now, true);
1549 
1550 	VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
1551 	VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1552 
1553 	fqs->fqs_large_flow = NULL;
1554 	for (i = 0; i < fqs->fqs_flows_count; i++) {
1555 		VERIFY(LIST_EMPTY(&fqs->fqs_flows[i]));
1556 	}
1557 
1558 	IFCQ_LEN(fqs->fqs_ifq) = 0;
1559 	IFCQ_BYTES(fqs->fqs_ifq) = 0;
1560 }
1561 
/*
 * Purge all packets of one flow (req->flow) for a given service class
 * (req->sc) across every allocated group.  req->packets / req->bytes
 * return the totals dropped.
 */
static void
fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
{
	fq_t *fq;
	uint64_t now;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	req->packets = req->bytes = 0;
	VERIFY(req->flow != 0);

	now = fq_codel_get_time();

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		uint32_t bytes = 0, pkts = 0;

		grp = fq_if_find_grp(fqs, grp_idx);
		/*
		 * Packet and traffic type are needed only if we want
		 * to create a flow queue; here we only look one up,
		 * so zeros are passed for them.
		 */
		fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, 0, 0, FQ_TFC_C, false);
		if (fq != NULL) {
			fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
			req->bytes += bytes;
			req->packets += pkts;
		}
	}
}
1594 
/*
 * Derive the DRR base quantum for an interface: roughly one maximal
 * frame, based on the interface family's MTU (plus the Ethernet header
 * for Ethernet), bumped up to the largest TSO segment when the driver
 * offloads segmentation, and floored at FQ_CODEL_DEFAULT_QUANTUM.
 */
static uint32_t
fq_if_calc_quantum(struct ifnet *ifp)
{
	uint32_t quantum, hwassist_flags;

	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		VERIFY(ifp->if_mtu <= IF_MAXMTU);
		/* account for the L2 header on Ethernet-framed interfaces */
		quantum = ifp->if_mtu + ETHER_HDR_LEN;
		break;

	case IFNET_FAMILY_CELLULAR:
	case IFNET_FAMILY_IPSEC:
	case IFNET_FAMILY_UTUN:
		VERIFY(ifp->if_mtu <= UINT16_MAX);
		quantum = ifp->if_mtu;
		break;

	default:
		quantum = FQ_CODEL_DEFAULT_QUANTUM;
		break;
	}

	/*
	 * If the driver can do TSO, a "packet" handed to it can be as
	 * large as the TSO MTU; size the quantum accordingly.
	 */
	hwassist_flags = if_get_driver_hwassist(ifp);
	if ((hwassist_flags & IFNET_TSOF) != 0) {
		VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
		VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
		quantum = MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
		quantum = (quantum != 0) ? quantum : IF_MAXMTU;
	}

	quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
#if DEBUG || DEVELOPMENT
	/* allow an override via the fq_codel_quantum tunable */
	quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
#endif /* DEBUG || DEVELOPMENT */
	VERIFY(quantum != 0);
	return quantum;
}
1633 
/*
 * Recompute every class's DRR quantum after an MTU change.  The
 * _FQ_CLASSQ_UPDATE_QUANTUM macro token-pastes the class name into both
 * the class index (FQ_IF_<name>_INDEX) and the per-class scaling macro
 * (FQ_CODEL_QUANTUM_<name>), which weights the base quantum per class.
 */
static void
fq_if_mtu_update(fq_if_t *fqs)
{
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q)                     \
	(_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum =        \
	    FQ_CODEL_QUANTUM_ ## _s(_q)                                 \

	uint32_t quantum;
	fq_if_group_t *grp;

	quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);

		/* driver-managed schedulers expose only the 4 AC classes */
		if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
		} else {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
		}
	}
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}
1673 
1674 static void
fq_if_event(fq_if_t * fqs,cqev_t ev)1675 fq_if_event(fq_if_t *fqs, cqev_t ev)
1676 {
1677 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1678 
1679 	switch (ev) {
1680 	case CLASSQ_EV_LINK_UP:
1681 	case CLASSQ_EV_LINK_DOWN:
1682 		fq_if_purge(fqs);
1683 		break;
1684 	case CLASSQ_EV_LINK_MTU:
1685 		fq_if_mtu_update(fqs);
1686 		break;
1687 	default:
1688 		break;
1689 	}
1690 }
1691 
1692 static void
fq_if_classq_suspend(fq_if_t * fqs,fq_if_classq_t * fq_cl)1693 fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1694 {
1695 	fq_if_purge_classq(fqs, fq_cl);
1696 	fqs->fqs_throttle = 1;
1697 	fq_cl->fcl_stat.fcl_throttle_on++;
1698 	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
1699 	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1700 }
1701 
1702 static void
fq_if_classq_resume(fq_if_t * fqs,fq_if_classq_t * fq_cl)1703 fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1704 {
1705 	VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
1706 	fqs->fqs_throttle = 0;
1707 	fq_cl->fcl_stat.fcl_throttle_off++;
1708 	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
1709 	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1710 }
1711 
1712 
/*
 * Get or set the scheduler's throttle level.  A query (tr->set == 0)
 * reports the current level; a set applies suspend/resume to the BK_SYS
 * class of every allocated group.  Returns EALREADY if the requested
 * level is already in effect.
 */
static int
fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
{
	struct ifclassq *ifq = fqs->fqs_ifq;
	uint8_t index;
	fq_if_group_t *grp;

#if !MACH_ASSERT
#pragma unused(ifq)
#endif
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!tr->set) {
		/* query: report the current throttle level */
		tr->level = fqs->fqs_throttle;
		return 0;
	}

	if (tr->level == fqs->fqs_throttle) {
		return EALREADY;
	}

	/* Throttling is allowed on BK_SYS class only */
	index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		grp = fq_if_find_grp(fqs, grp_idx);
		switch (tr->level) {
		case IFNET_THROTTLE_OFF:
			fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
			break;
		case IFNET_THROTTLE_OPPORTUNISTIC:
			fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
			break;
		default:
			break;
		}
	}
	return 0;
}
1755 
1756 static inline boolean_t
fq_if_is_fq_cl_paced(fq_if_classq_t * fq_cl,uint64_t now)1757 fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now)
1758 {
1759 	if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) {
1760 		return true;
1761 	}
1762 
1763 	fq_cl->fcl_flags &= ~FCL_PACED;
1764 	fq_cl->fcl_next_tx_time = 0;
1765 	return false;
1766 }
1767 
1768 static void
fq_if_grp_stat_sc(fq_if_t * fqs,fq_if_group_t * grp,cqrq_stat_sc_t * stat,uint64_t now)1769 fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now)
1770 {
1771 	uint8_t pri;
1772 	fq_if_classq_t *fq_cl;
1773 
1774 	ASSERT(stat != NULL);
1775 	pri = fq_if_service_to_priority(fqs, stat->sc);
1776 
1777 	fq_cl = &grp->fqg_classq[pri];
1778 	stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
1779 	stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;
1780 
1781 	if (fq_codel_enable_pacing && fq_codel_enable_l4s &&
1782 	    fq_if_is_fq_cl_paced(fq_cl, now)) {
1783 		stat->packets = 0;
1784 		stat->bytes = 0;
1785 	}
1786 }
1787 
1788 static boolean_t
fq_if_is_grp_all_paced(fq_if_group_t * grp)1789 fq_if_is_grp_all_paced(fq_if_group_t *grp)
1790 {
1791 	fq_if_classq_t *fq_cl;
1792 	uint64_t now;
1793 
1794 	if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) {
1795 		return false;
1796 	}
1797 
1798 	now = fq_codel_get_time();
1799 	for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) {
1800 		fq_cl = &grp->fqg_classq[fq_cl_idx];
1801 		if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) {
1802 			continue;
1803 		}
1804 		if (!fq_if_is_fq_cl_paced(fq_cl, now)) {
1805 			return false;
1806 		}
1807 	}
1808 
1809 	return true;
1810 }
1811 
1812 boolean_t
fq_if_allow_dequeue(struct ifclassq * ifq)1813 fq_if_allow_dequeue(struct ifclassq *ifq)
1814 {
1815 	fq_if_group_t *grp;
1816 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1817 
1818 	IFCQ_LOCK_ASSERT_HELD(ifq);
1819 
1820 	if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) {
1821 		return false;
1822 	}
1823 
1824 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1825 		grp = fqs->fqs_classq_groups[grp_idx];
1826 		if (grp == NULL || FQG_BYTES(grp) == 0) {
1827 			continue;
1828 		}
1829 
1830 		if (!fq_if_is_grp_all_paced(grp)) {
1831 			return false;
1832 		}
1833 	}
1834 
1835 	return true;
1836 }
1837 
/*
 * Report queued packet/byte counts for a service class, scoped either to
 * all groups (IF_CLASSQ_ALL_GRPS) or to one group index; stat->sc ==
 * MBUF_SC_UNSPEC means "all classes".  Groups whose traffic is entirely
 * paced (L4S) are reported as empty so callers do not doorbell for them.
 */
void
fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
{
	cqrq_stat_sc_t grp_sc_stat;
	fq_if_group_t *grp;
	uint64_t now = fq_codel_get_time();

	if (stat == NULL) {
		return;
	}
	/* grp_sc_stat's counters are filled by fq_if_grp_stat_sc() below */
	grp_sc_stat.sc = stat->sc;
	stat->packets = 0;
	stat->bytes = 0;

	if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
		if (stat->sc == MBUF_SC_UNSPEC) {
			/* whole interface: report totals unless all traffic is paced */
			if (!fq_if_allow_dequeue(fqs->fqs_ifq)) {
				stat->packets = IFCQ_LEN(fqs->fqs_ifq);
				stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
			}
		} else {
			/* sum the class's counters across every allocated group */
			for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				grp = fqs->fqs_classq_groups[grp_idx];
				if (grp == NULL) {
					continue;
				}

				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		}
		return;
	}

	if (stat->sc == MBUF_SC_UNSPEC) {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			/* combined mode: aggregate all combined groups' totals */
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				stat->packets += FQG_LEN(grp);
				stat->bytes += FQG_BYTES(grp);
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			if (!fq_if_is_grp_all_paced(grp)) {
				stat->packets = FQG_LEN(grp);
				stat->bytes = FQG_BYTES(grp);
			}
		}
	} else {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			/* one class, summed over all combined groups */
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			fq_if_grp_stat_sc(fqs, grp, stat, now);
		}
	}
}
1905 
1906 int
fq_if_request(struct ifclassq * ifq,cqrq_t rq,void * arg)1907 fq_if_request(struct ifclassq *ifq, cqrq_t rq, void *arg)
1908 {
1909 	int err = 0;
1910 	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1911 
1912 	IFCQ_LOCK_ASSERT_HELD(ifq);
1913 
1914 	/*
1915 	 * These are usually slow operations, convert the lock ahead of time
1916 	 */
1917 	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1918 	switch (rq) {
1919 	case CLASSQRQ_PURGE:
1920 		fq_if_purge(fqs);
1921 		VERIFY(IFCQ_IS_EMPTY(ifq));
1922 		break;
1923 	case CLASSQRQ_PURGE_SC:
1924 		fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
1925 		break;
1926 	case CLASSQRQ_EVENT:
1927 		fq_if_event(fqs, *(cqev_t *)arg);
1928 		break;
1929 	case CLASSQRQ_THROTTLE:
1930 		fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
1931 		break;
1932 	case CLASSQRQ_STAT_SC:
1933 		fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
1934 		break;
1935 	}
1936 	return err;
1937 }
1938 
#if (DEBUG || DEVELOPMENT)
/*
 * Per-interface sysctl handler (registered via skoid in
 * fq_if_setup_common) that reads or writes the classic (FQ_TFC_C)
 * target queue delay of group 0 on this scheduler.
 */
static int
fq_if_configure_target_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	fq_if_t *__single fqs = arg1;
	uint64_t *target_delay;
	uint64_t new_target = 0;
	int changed;
	int error;

	/* scheduler must be attached, enabled, and have its default group */
	if (fqs->fqs_ifq == NULL || !IFCQ_IS_ENABLED(fqs->fqs_ifq) || fqs->fqs_classq_groups[0] == NULL) {
		return ENXIO;
	}

	target_delay = &fqs->fqs_classq_groups[0]->fqg_target_qdelays[FQ_TFC_C];
	error = sysctl_io_number(req, *target_delay,
	    sizeof(*target_delay), &new_target, &changed);
	/* only store on a successful write request */
	if (error == 0 && changed != 0) {
		*target_delay = new_target;
	}
	return error;
}
#endif /* (DEBUG || DEVELOPMENT) */
1963 
1964 
1965 
/*
 * Common setup for both the legacy and the new fq-codel scheduler:
 * allocates the fq_if_t state, selects the bitmap ops and the
 * enqueue/dequeue entry points, attaches to the ifclassq, and creates
 * the default group 0.  Returns 0 or an errno.
 */
static int
fq_if_setup_common(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype, boolean_t legacy)
{
	fq_if_t *fqs = NULL;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

	fqs = fq_if_alloc(ifq, ptype);
	if (fqs == NULL) {
		return ENOMEM;
	}
	/* driver-managed interfaces use the service-class bitmap ops */
	if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
		fqs->fqs_flags |= FQS_DRIVER_MANAGED;
		fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
	} else {
		fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
	}

	if (legacy) {
		fqs->fqs_dequeue = fq_codel_dq_legacy;
		fqs->fqs_enqueue = fq_codel_enq_legacy;
		fqs->fqs_flags |= FQS_LEGACY;
		err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
	} else {
		fqs->fqs_dequeue = fq_codel_dq;
		fqs->fqs_enqueue = fq_codel_enq;
		err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL_NEW, fqs);
	}

#if (DEBUG || DEVELOPMENT)
	/*
	 * NOTE(review): the skoid is created before 'err' is checked, so on
	 * attach failure it is registered on an fqs that is about to be
	 * destroyed — confirm fq_if_destroy tears the skoid down.
	 */
	struct ifnet *ifp = ifq->ifcq_ifp;
	struct skoid *fqs_skoid = (struct skoid *)&fqs->fqs_oid;
	skoid_create(fqs_skoid,
	    SKOID_SNODE(_net_classq_fq_codel_params), if_name(ifp),
	    CTLFLAG_RW);
	skoid_add_handler((struct skoid *)fqs_skoid, "target_delay", CTLFLAG_RW,
	    fq_if_configure_target_sysctl, fqs, 0);
#endif /* (DEBUG || DEVELOPMENT) */

	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
		    "failed to attach fq_if: %d\n", __func__, err);
		fq_if_destroy(fqs);
		return err;
	}

	/*
	 * Always create one group. If qset 0 is added later,
	 * this group will be updated.
	 */
	err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
		    "failed to create a fq group: %d\n", __func__, err);
		/*
		 * NOTE(review): fqs was already attached above but is
		 * destroyed here — confirm fq_if_destroy also detaches or
		 * clears ifq->ifcq_disc on this path.
		 */
		fq_if_destroy(fqs);
	}

	return err;
}
2029 
/*
 * Attach the legacy fq-codel scheduler (PKTSCHEDT_FQ_CODEL) to the
 * ifclassq; see fq_if_setup_common for the shared setup logic.
 */
int
fq_if_setup_legacy(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	return fq_if_setup_common(ifq, flags, ptype, true);
}
2036 
/*
 * Attach the new fq-codel scheduler (PKTSCHEDT_FQ_CODEL_NEW) to the
 * ifclassq; see fq_if_setup_common for the shared setup logic.
 */
int
fq_if_setup_new(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	return fq_if_setup_common(ifq, flags, ptype, false);
}
2043 
2044 fq_t *
fq_if_hash_pkt(fq_if_t * fqs,fq_if_group_t * fq_grp,uint32_t flowid,mbuf_svc_class_t svc_class,uint64_t now,uint8_t pkt_proto,uint8_t pkt_flowsrc,fq_tfc_type_t tfc_type,bool create)2045 fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, uint32_t flowid,
2046     mbuf_svc_class_t svc_class, uint64_t now, uint8_t pkt_proto,
2047     uint8_t pkt_flowsrc, fq_tfc_type_t tfc_type, bool create)
2048 {
2049 	fq_t *fq = NULL;
2050 	flowq_list_t *fq_list;
2051 	fq_if_classq_t *fq_cl;
2052 	uint32_t fqs_hash_id;
2053 	u_int8_t scidx;
2054 
2055 	scidx = fq_if_service_to_priority(fqs, svc_class);
2056 
2057 	fqs_hash_id = fq_if_flow_hash_id(flowid);
2058 
2059 	fq_list = &fqs->fqs_flows[fqs_hash_id];
2060 
2061 	LIST_FOREACH(fq, fq_list, fq_hashlink) {
2062 		if (fq->fq_flowhash == flowid &&
2063 		    fq->fq_sc_index == scidx &&
2064 		    fq->fq_tfc_type == tfc_type &&
2065 		    fq->fq_group == fq_grp) {
2066 			break;
2067 		}
2068 	}
2069 	if (fq == NULL && create) {
2070 		/* If the flow is not already on the list, allocate it */
2071 		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
2072 		fq = fq_alloc(fqs->fqs_ptype);
2073 		if (fq != NULL) {
2074 			fq->fq_flowhash = flowid;
2075 			fq->fq_sc_index = scidx;
2076 			fq->fq_group = fq_grp;
2077 			fq->fq_tfc_type = tfc_type;
2078 			fq_cl = &FQ_CLASSQ(fq);
2079 			fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
2080 			fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
2081 			fq->fq_next_tx_time = FQ_INVALID_TX_TS;
2082 			LIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
2083 			fq_cl->fcl_stat.fcl_flows_cnt++;
2084 			fq->fq_flags |= fq_codel_enable_ecn ? FQF_ECN_CAPABLE : 0;
2085 			if (
2086 #if (DEBUG || DEVELOPMENT)
2087 				ifclassq_congestion_feedback &&
2088 #endif /* (DEBUG || DEVELOPMENT) */
2089 				tfc_type != FQ_TFC_L4S &&
2090 				!(fqs->fqs_flags & FQS_LEGACY) &&
2091 				(pkt_proto == IPPROTO_TCP || pkt_proto == IPPROTO_QUIC) &&
2092 				(pkt_flowsrc == FLOWSRC_INPCB || pkt_flowsrc == FLOWSRC_CHANNEL)) {
2093 				FQ_ENABLE_CONGESTION_FEEDBACK(fq);
2094 				fq->fq_flowsrc = pkt_flowsrc;
2095 			}
2096 		}
2097 		KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
2098 		    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
2099 		    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
2100 	} else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
2101 		fq_if_reuse_empty_flow(fqs, fq, now);
2102 	}
2103 
2104 	/*
2105 	 * If getq time is not set because this is the first packet or after
2106 	 * idle time, set it now so that we can detect a stall.
2107 	 */
2108 	if (fq != NULL && fq->fq_getqtime == 0) {
2109 		fq->fq_getqtime = now;
2110 	}
2111 
2112 	return fq;
2113 }
2114 
/*
 * Unlink a flow from its hash list and free it.  Any flow-control
 * advisory still armed on the flow is released first so its source is
 * not left blocked on a queue that no longer exists.
 */
static void
fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
{
	/* empty flows must have FQF_EMPTY_FLOW cleared by the caller first
	 * (see fq_if_purge_empty_flow) */
	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
	LIST_REMOVE(fq, fq_hashlink);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
	    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
	fq_destroy(fq, fqs->fqs_ptype);
}
2129 
2130 inline boolean_t
fq_if_at_drop_limit(fq_if_t * fqs)2131 fq_if_at_drop_limit(fq_if_t *fqs)
2132 {
2133 	return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
2134 	       TRUE : FALSE;
2135 }
2136 
/*
 * TRUE when the interface queue length is at or above 90% of the
 * drop limit.
 */
inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t *fqs)
{
	/*
	 * Whether we are above 90% of the queue limit. This is used to tell if we
	 * can stop flow controlling the largest flow.
	 */
	return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
}
2146 
/*
 * Revive a flow that was parked on the scheduler's empty-flow list:
 * unlink it, clear its flow-state flags, reset its timestamps, and
 * count it against its class queue again.
 */
static inline void
fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
{
	ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
	/* flow is on no new/old flow list at this point */
	STAILQ_NEXT(fq, fq_actlink) = NULL;
	fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
	fq->fq_empty_purge_time = 0;
	fq->fq_getqtime = 0;
	/* restart the update interval from 'now' */
	fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
	fqs->fqs_empty_list_cnt--;
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
	/* the flow counts toward its class's budget again */
	fq_cl->fcl_stat.fcl_flows_cnt++;
}
2161 
/*
 * Park a drained flow on the scheduler's empty-flow list instead of
 * destroying it right away; it is purged after fq_empty_purge_delay
 * unless reused first (fq_if_reuse_empty_flow).
 */
inline void
fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * NOTE(review): as written this asserts that some flag OUTSIDE of
	 * NEW/OLD/FLOWCTL_ON is set (e.g. FQF_FLOWCTL_CAPABLE), not that
	 * those three are clear — confirm the expression matches intent.
	 */
	ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
	fq->fq_empty_purge_time = now + fq_empty_purge_delay;
	TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
	fq->fq_flags |= FQF_EMPTY_FLOW;
	FQ_CLEAR_OVERWHELMING(fq);
	fqs->fqs_empty_list_cnt++;
	/*
	 * fcl_flows_cnt is used in budget determination for the class.
	 * empty flow shouldn't contribute to the budget.
	 */
	fq_cl->fcl_stat.fcl_flows_cnt--;
}
2178 
2179 static void
fq_if_purge_empty_flow(fq_if_t * fqs,fq_t * fq)2180 fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
2181 {
2182 	fq_if_classq_t *fq_cl;
2183 	fq_cl = &FQ_CLASSQ(fq);
2184 
2185 	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
2186 	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
2187 	fq->fq_flags &= ~FQF_EMPTY_FLOW;
2188 	fqs->fqs_empty_list_cnt--;
2189 	/* Remove from the hash list and free the flow queue */
2190 	fq_if_destroy_flow(fqs, fq_cl, fq);
2191 }
2192 
/*
 * Purge flows from the empty-flow list.  Normally only flows whose
 * purge deadline has passed are freed, at most FQ_EMPTY_PURGE_MAX per
 * call to bound the work; with purge_all the whole list is drained
 * unconditionally.
 */
static void
fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
{
	fq_t *fq, *tmp;
	int i = 0;

	if (fqs->fqs_empty_list_cnt == 0) {
		ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
		return;
	}

	/*
	 * Entries are appended with now + fq_empty_purge_delay, so the list
	 * is ordered by purge deadline; stop at the first one not yet due.
	 * Note: i++ only evaluates when the deadline test is false, which
	 * is what bounds the number of purges per call.
	 */
	TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
		if (!purge_all && ((now < fq->fq_empty_purge_time) ||
		    (i++ == FQ_EMPTY_PURGE_MAX))) {
			break;
		}
		fq_if_purge_empty_flow(fqs, fq);
	}

	if (__improbable(purge_all)) {
		VERIFY(fqs->fqs_empty_list_cnt == 0);
		VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
	}
}
2217 
/*
 * A flow on the old-flows list has fully drained: unlink it, release
 * any flow control armed on it, and park it on the empty-flow list
 * for possible reuse (fq_if_move_to_empty_flow).
 */
static void
fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * Remove the flow queue from the old flows list.
	 */
	STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
	fq->fq_flags &= ~FQF_OLD_FLOW;
	fq_cl->fcl_stat.fcl_oldflows_cnt--;
	/* caller guarantees the flow is drained */
	VERIFY(fq->fq_bytes == 0);

	/* release any flow control */
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}

	/* move the flow queue to empty flows list */
	fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
}
2238 
2239 static void
fq_if_empty_new_flow(fq_t * fq,fq_if_classq_t * fq_cl)2240 fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
2241 {
2242 	/* Move to the end of old queue list */
2243 	STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
2244 	    flowq, fq_actlink);
2245 	fq->fq_flags &= ~FQF_NEW_FLOW;
2246 	fq_cl->fcl_stat.fcl_newflows_cnt--;
2247 
2248 	STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
2249 	fq->fq_flags |= FQF_OLD_FLOW;
2250 	fq_cl->fcl_stat.fcl_oldflows_cnt++;
2251 }
2252 
/*
 * Drop one packet from the head of the current largest flow (called
 * when the scheduler is at its drop limit).  Scrubs scheduler-private
 * packet metadata, retires the flow from the active lists if this was
 * its last packet, updates drop stats, and frees (or droptaps) the
 * packet.  No-op when no largest flow is tracked.
 */
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *__single pkt_flags;
	uint64_t *__single pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* queue can not be empty on the largest flow */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* clear scheduler-private metadata before releasing the packet */
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* this drop drained the flow: move it off the active lists */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	/* hand the packet to droptap when verbose drop capture is on */
	if (__improbable(droptap_verbose > 0)) {
		pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_DROP,
		    __func__, __LINE__, 0);
	} else {
		pktsched_free_pkt(&pkt);
	}
	fq_cl->fcl_stat.fcl_drop_overflow++;
}
2313 
2314 inline void
fq_if_is_flow_heavy(fq_if_t * fqs,fq_t * fq)2315 fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
2316 {
2317 	fq_t *prev_fq;
2318 
2319 	if (fqs->fqs_large_flow != NULL &&
2320 	    fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2321 		fqs->fqs_large_flow = NULL;
2322 	}
2323 
2324 	if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2325 		return;
2326 	}
2327 
2328 	prev_fq = fqs->fqs_large_flow;
2329 	if (prev_fq == NULL) {
2330 		if (!fq_empty(fq, fqs->fqs_ptype)) {
2331 			fqs->fqs_large_flow = fq;
2332 		}
2333 		return;
2334 	} else if (fq->fq_bytes > prev_fq->fq_bytes) {
2335 		fqs->fqs_large_flow = fq;
2336 	}
2337 }
2338 
/*
 * Arm flow control for a flow: allocate a flow-advisory entry for the
 * packet's source and put it on the scheduler's flow-control list.
 * Returns TRUE when the flow is (or already was) flow controlled,
 * FALSE when the entry could not be allocated.
 */
boolean_t
fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
    fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	/* test knob: report success without actually arming flow control */
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	/* nothing to do if this flow/source pair is already armed */
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == fq->fq_flowhash) {
			/* Already on flowcontrol list */
			return TRUE;
		}
	}
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		/* XXX Add number of bytes in the queue */
		STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
		fq_cl->fcl_stat.fcl_flow_control++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_control,
		    fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
		    if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}

	/* skywalk channel flows get their flowadv state set immediately */
	if (fce != NULL && fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
		kern_channel_flowadv_set(fce);
	}

	return (fce != NULL) ? TRUE : FALSE;
}
2381 
/*
 * Take a flow-advisory entry off the scheduler's flow-control list
 * and hand it to the flow advisor (flowadv_add_entry) so the blocked
 * source can resume.
 */
static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	/* entry leaves the list; sanitize its link before handing it off */
	STAILQ_NEXT(fce, fce_link) = NULL;
	flowadv_add_entry(fce);
}
2389 
/*
 * Release flow control for a flow (the counterpart of
 * fq_if_add_fcentry): look up its advisory entry, send the feedback
 * event, and clear FQF_FLOWCTL_ON.  Safe to call when the flow has no
 * entry on the list — the flag is cleared regardless.
 */
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index,
		    fq->fq_bytes);
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}
2417 
/*
 * Send a congestion-experienced advisory for the packet's flow,
 * carrying the congestion/CE counters accumulated since the last
 * report.  Returns TRUE when the advisory entry was allocated and
 * queued, FALSE on allocation failure.
 */
boolean_t
fq_if_report_congestion(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t congestion_cnt,
    uint32_t l4s_ce_cnt, uint32_t pkt_cnt)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	/* test knob: report success without sending the advisory */
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED;
		fce->fce_congestion_cnt = congestion_cnt;
		fce->l4s_ce_cnt = l4s_ce_cnt;
		fce->fce_pkts_since_last_report = pkt_cnt;

		flowadv_add_entry(fce);
	}
	return (fce != NULL) ? TRUE : FALSE;
}
2443 
2444 
/*
 * Dequeue packets from one class queue, up to pktlimit packets and
 * bytelimit bytes (bytelimit is clamped to the class's DRR budget
 * when budget_restricted).  Flows are serviced per RFC 8290: the
 * new-flows list first, then the old-flows list, with deficit
 * round-robin between flows.  Dequeued packets are chained at
 * *top/*bottom, or parked on per-flow heads when fq_dqlist is given.
 * Flows whose next transmit time is after 'now' are skipped; if every
 * flow was paced, FCL_PACED is set on the class and the earliest
 * pending tx time is reported through *next_tx_time / *fq_cl_paced.
 */
void
fq_if_dequeue_class(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now, bool *fq_cl_paced,
    uint64_t *next_tx_time)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	bool all_paced = true;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;
	uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS;

	/* pick the flow-dequeue routine matching the packet type */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

	/* first pass: service the new-flows list */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_NEW_FLOW);
		uint64_t fq_tx_time;
		/* paced flow: remember its earliest tx time and skip it */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			/* chain onto the flow's private dequeue head/tail */
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/*
		 * From RFC 8290:
		 * if that queue has a negative number of credits (i.e., it has already
		 * dequeued at least a quantum of bytes), it is given an additional
		 * quantum of credits, the queue is put onto _the end of_ the list of
		 * old queues, and the routine selects the next queue and starts again.
		 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}
		/*
		 * TODO: also grant credit when the flow became paced, so that
		 * a paced flow is treated the same way as an empty one.
		 */

		/* the dequeue may have left the flow paced; track its tx time */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (limit_reached) {
			goto done;
		}
	}

	/* second pass: service the old-flows list */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		/* NOTE(review): 'destroy' is written but never read — dead? */
		bool destroy = true;
		uint64_t fq_tx_time;

		/* paced flow: remember its earliest tx time and skip it */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	/* everything serviceable was paced: mark the class and record when */
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	/* splice deficit-exhausted flows back onto the old-flows tail */
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}
2629 
/*
 * Destroy the fq-codel scheduler attached to this ifclassq and detach
 * it; the classq lock must be held.
 */
void
fq_if_teardown(struct ifclassq *ifq)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(fqs != NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_FQ_CODEL || ifq->ifcq_type == PKTSCHEDT_FQ_CODEL_NEW);
	/* free the scheduler state before unhooking it from the ifclassq */
	fq_if_destroy(fqs);
	ifq->ifcq_disc = NULL;
	ifclassq_detach(ifq);
}
2642 
2643 static void
fq_export_flowstats(fq_if_t * fqs,fq_t * fq,struct fq_codel_flowstats * flowstat)2644 fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
2645     struct fq_codel_flowstats *flowstat)
2646 {
2647 	bzero(flowstat, sizeof(*flowstat));
2648 	flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
2649 	flowstat->fqst_bytes = fq->fq_bytes;
2650 	flowstat->fqst_flowhash = fq->fq_flowhash;
2651 	if (fq->fq_flags & FQF_NEW_FLOW) {
2652 		flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
2653 	}
2654 	if (fq->fq_flags & FQF_OLD_FLOW) {
2655 		flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
2656 	}
2657 	if (fq->fq_flags & FQF_DELAY_HIGH) {
2658 		flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
2659 	}
2660 	if (fq->fq_flags & FQF_FLOWCTL_ON) {
2661 		flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
2662 	}
2663 	if (fqs->fqs_large_flow == fq) {
2664 		flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
2665 	}
2666 }
2667 
2668 int
fq_if_getqstats(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)2669 fq_if_getqstats(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
2670     struct if_ifclassq_stats *ifqs)
2671 {
2672 	struct fq_codel_classstats *fcls;
2673 	fq_if_classq_t *fq_cl;
2674 	fq_if_t *fqs;
2675 	fq_t *fq = NULL;
2676 	fq_if_group_t *grp;
2677 	u_int32_t i, flowstat_cnt;
2678 
2679 	if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
2680 		return EINVAL;
2681 	}
2682 
2683 	fqs = (fq_if_t *)ifq->ifcq_disc;
2684 	if (fqs->fqs_classq_groups[gid] == NULL) {
2685 		return ENXIO;
2686 	}
2687 
2688 	fcls = &ifqs->ifqs_fq_codel_stats;
2689 
2690 	fq_cl = &FQS_CLASSQ(fqs, gid, qid);
2691 	grp = fq_if_find_grp(fqs, gid);
2692 
2693 	fcls->fcls_pri = fq_cl->fcl_pri;
2694 	fcls->fcls_service_class = fq_cl->fcl_service_class;
2695 	fcls->fcls_quantum = fq_cl->fcl_quantum;
2696 	fcls->fcls_drr_max = fq_cl->fcl_drr_max;
2697 	fcls->fcls_budget = fq_cl->fcl_budget;
2698 	fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
2699 	fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
2700 	fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
2701 	fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
2702 	fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
2703 	fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
2704 	fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
2705 	fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
2706 	fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
2707 	fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
2708 	fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
2709 	fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
2710 	fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
2711 	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2712 	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2713 	fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
2714 	fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
2715 	fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
2716 	fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
2717 	fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
2718 	fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
2719 	fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
2720 	fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
2721 	fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
2722 	fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
2723 	fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
2724 	fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
2725 	fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
2726 	fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
2727 	fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
2728 	fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
2729 	fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
2730 	fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
2731 	fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
2732 	fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;
2733 	fcls->fcls_high_delay_drop = fq_cl->fcl_stat.fcl_high_delay_drop;
2734 	fcls->fcls_congestion_feedback = fq_cl->fcl_stat.fcl_congestion_feedback;
2735 
2736 	/* Gather per flow stats */
2737 	flowstat_cnt = min((fcls->fcls_newflows_cnt +
2738 	    fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
2739 	i = 0;
2740 	STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
2741 		if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
2742 			break;
2743 		}
2744 
2745 		/* leave space for a few old flows */
2746 		if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
2747 		    i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
2748 			break;
2749 		}
2750 		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2751 		i++;
2752 	}
2753 	STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
2754 		if (i >= flowstat_cnt) {
2755 			break;
2756 		}
2757 		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2758 		i++;
2759 	}
2760 	VERIFY(i <= flowstat_cnt);
2761 	fcls->fcls_flowstats_cnt = i;
2762 	return 0;
2763 }
2764 
/*
 * Create the flow-queue classq group at grp_idx, or — for the default
 * group (index 0) only — refresh an already-existing group's flags and
 * AQM parameters.
 *
 * Returns 0 on success, EINVAL if a non-default group index is already
 * populated, ENOMEM if the group zone allocation fails.
 */
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
/*
 * Expands to fq_if_classq_init() for service class _s by token-pasting
 * the class index (FQ_IF_<_s>_INDEX), per-class quantum
 * (FQ_CODEL_QUANTUM_<_s>), DRR max, and mbuf service class.
 */
#define _FQ_CLASSQ_INIT(_grp, _s, _q)                      \
    fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX,         \
	FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s),     \
	MBUF_SC_ ## _s );

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	/* The default group may be re-created; skip init and just update it. */
	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	/* A non-default group must not already exist at this index. */
	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	/* Per-class DRR quantum is derived from the interface's properties. */
	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		/* Driver-managed mode exposes only the four WMM access classes. */
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		static_assert(SCIDX_SIG == SCIDX_VI);
		static_assert(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	/* A default group is scheduled combined with the others; the rest
	 * are kept separated. */
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	/* Recompute AQM targets for both classic and L4S traffic classes. */
	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	fq_if_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	fq_if_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	fq_if_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	fq_if_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}
2845 
2846 fq_if_group_t *
fq_if_find_grp(fq_if_t * fqs,uint8_t grp_idx)2847 fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
2848 {
2849 	fq_if_group_t *grp;
2850 
2851 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2852 	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);
2853 
2854 	grp = fqs->fqs_classq_groups[grp_idx];
2855 	VERIFY(grp != NULL);
2856 
2857 	return grp;
2858 }
2859 
2860 static void
fq_if_purge_grp(fq_if_t * fqs,fq_if_group_t * grp)2861 fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
2862 {
2863 	for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
2864 		fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
2865 	}
2866 
2867 	bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
2868 	grp->fqg_len = 0;
2869 	grp->fqg_bytes = 0;
2870 	fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
2871 }
2872 
2873 void
fq_if_destroy_grps(fq_if_t * fqs)2874 fq_if_destroy_grps(fq_if_t *fqs)
2875 {
2876 	fq_if_group_t *__single grp;
2877 
2878 	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2879 
2880 	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
2881 		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
2882 			continue;
2883 		}
2884 
2885 		grp = fq_if_find_grp(fqs, grp_idx);
2886 		fq_if_purge_grp(fqs, grp);
2887 		zfree(fq_if_grp_zone, grp);
2888 		fqs->fqs_classq_groups[grp_idx] = NULL;
2889 	}
2890 }
2891 
2892 static inline boolean_t
fq_if_is_grp_combined(fq_if_t * fqs,uint8_t grp_idx)2893 fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
2894 {
2895 	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
2896 }
2897 
2898 void
fq_if_set_grp_combined(struct ifclassq * ifcq,uint8_t grp_idx)2899 fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
2900 {
2901 	fq_if_t *fqs;
2902 	fq_if_group_t *grp;
2903 
2904 	IFCQ_LOCK_ASSERT_HELD(ifcq);
2905 
2906 	fqs = (fq_if_t *)ifcq->ifcq_disc;
2907 	grp = fq_if_find_grp(fqs, grp_idx);
2908 
2909 	if (fq_if_is_grp_combined(fqs, grp_idx)) {
2910 		return;
2911 	}
2912 
2913 	/*
2914 	 * We keep the current fq_deficit and fcl_budget when combining a group.
2915 	 * That might disrupt the AQM but only for a moment.
2916 	 */
2917 	pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
2918 	TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2919 }
2920 
2921 void
fq_if_set_grp_separated(struct ifclassq * ifcq,uint8_t grp_idx)2922 fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
2923 {
2924 	fq_if_t *fqs;
2925 	fq_if_group_t *grp;
2926 
2927 	IFCQ_LOCK_ASSERT_HELD(ifcq);
2928 
2929 	fqs = (fq_if_t *)ifcq->ifcq_disc;
2930 	grp = fq_if_find_grp(fqs, grp_idx);
2931 
2932 	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
2933 		return;
2934 	}
2935 
2936 	pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
2937 	TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2938 }
2939