/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/types.h>
#include <sys/param.h>
#include <kern/zalloc.h>
#include <net/ethernet.h>
#include <net/if_var.h>
#include <net/if.h>
#include <net/classq/classq.h>
#include <net/classq/classq_fq_codel.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <os/log.h>
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */

#define FQ_CODEL_DEFAULT_QUANTUM 1500

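/*
 * Per-service-class DRR quantum, derived from the base quantum computed
 * in fq_if_calc_quantum() below: the latency-sensitive AV, RV and VI
 * classes get twice the base quantum per round, while VO and CTL get
 * 2/5 of it. For example, with the 1500-byte default quantum, VI is
 * granted 3000 bytes per DRR round and VO only 600.
 */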
#define FQ_CODEL_QUANTUM_BK_SYS(_q)    (_q)
#define FQ_CODEL_QUANTUM_BK(_q)        (_q)
#define FQ_CODEL_QUANTUM_BE(_q)        (_q)
#define FQ_CODEL_QUANTUM_RD(_q)        (_q)
#define FQ_CODEL_QUANTUM_OAM(_q)       (_q)
#define FQ_CODEL_QUANTUM_AV(_q)        (_q * 2)
#define FQ_CODEL_QUANTUM_RV(_q)        (_q * 2)
#define FQ_CODEL_QUANTUM_VI(_q)        (_q * 2)
#define FQ_CODEL_QUANTUM_VO(_q)        ((_q * 2) / 5)
#define FQ_CODEL_QUANTUM_CTL(_q)       ((_q * 2) / 5)

static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);

static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
#if (DEVELOPMENT || DEBUG)
SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
    CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
#endif /* (DEVELOPMENT || DEBUG) */

typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;

static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
static void fq_if_destroy(fq_if_t *fqs);
static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
    uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t,
    int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
    uint32_t *, flowq_dqlist_t *, bool, uint64_t now);
void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
static void fq_if_purge(fq_if_t *);
static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
    uint64_t);
static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
    fq_t *fq, uint64_t now);
static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
    bool purge_all);
static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq,
    mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
    classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
    u_int32_t *retbytecnt, uint8_t grp_idx);
static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
    cqrq_stat_sc_t *stat);
static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
static void fq_if_destroy_grps(fq_if_t *fqs);

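/*
 * Default per-class DRR weights, indexed by class priority (a lower
 * index means a higher priority). Each weight caps how many flows'
 * worth of quantum the class is credited per budget refill in
 * fq_if_dequeue_classq_multi_common(), so CTL and VO are allowed more
 * quanta per round than BK and BK_SYS. The "ifcq_drr_max" boot-arg
 * parsed in pktsched_fq_init() overrides these defaults.
 */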
uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
	[FQ_IF_CTL_INDEX]       = 8,
	[FQ_IF_VO_INDEX]        = 8,
	[FQ_IF_VI_INDEX]        = 6,
	[FQ_IF_RV_INDEX]        = 6,
	[FQ_IF_AV_INDEX]        = 6,
	[FQ_IF_OAM_INDEX]       = 4,
	[FQ_IF_RD_INDEX]        = 4,
	[FQ_IF_BE_INDEX]        = 4,
	[FQ_IF_BK_INDEX]        = 2,
	[FQ_IF_BK_SYS_INDEX]    = 2,
};

#define FQ_CODEL_DRR_MAX(_s)    fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]

static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state dst_state, fq_if_state src_state);
static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state, fq_if_group_t **selected_grp);

static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state dst_state, fq_if_state src_state);
static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state, fq_if_group_t **selected_grp);

bitmap_ops_t fq_if_grps_bitmap_ops =
{
	.ffs    = fq_if_grps_bitmap_ffs,
	.zeros  = fq_if_grps_bitmap_zeros,
	.cpy    = fq_if_grps_bitmap_cpy,
	.clr    = fq_if_grps_bitmap_clr,
};

bitmap_ops_t fq_if_grps_sc_bitmap_ops =
{
	.ffs    = fq_if_grps_sc_bitmap_ffs,
	.zeros  = fq_if_grps_sc_bitmap_zeros,
	.cpy    = fq_if_grps_sc_bitmap_cpy,
	.clr    = fq_if_grps_sc_bitmap_clr,
};

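/*
 * Override the default DRR weights from the "ifcq_drr_max" boot-arg.
 * The value is a comma-separated list of decimal weights applied in
 * priority-index order; e.g. ifcq_drr_max=8,8,6 overrides the first
 * three entries of fq_codel_drr_max_values[] and leaves the rest at
 * their defaults.
 */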
void
pktsched_fq_init(void)
{
	// format looks like ifcq_drr_max=8,8,6
	char buf[(FQ_IF_MAX_CLASSES) * 3];
	size_t i, len, pri_index = 0;
	uint32_t drr = 0;
	if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
		return;
	}

	len = strlen(buf);
	for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
		if (buf[i] != ',' && buf[i] != '\0') {
			VERIFY(buf[i] >= '0' && buf[i] <= '9');
			drr = drr * 10 + buf[i] - '0';
			continue;
		}
		fq_codel_drr_max_values[pri_index] = drr;
		pri_index += 1;
		drr = 0;
	}
}

#define FQ_IF_FLOW_HASH_ID(_flowid_) \
	(((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK)

#define FQ_IF_CLASSQ_IDLE(_fcl_) \
	(STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
	STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))

typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
    int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
    uint32_t *, boolean_t *, uint32_t, uint64_t);

static void
fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
}

static inline uint64_t
fq_codel_get_time(void)
{
	struct timespec ts;
	uint64_t now;

	nanouptime(&ts);
	now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
	return now;
}

#if SKYWALK
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
#endif /* SKYWALK */

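/*
 * fq_getq_flow_kpkt() and fq_getq_flow_mbuf() below are the per-packet-type
 * instances of fq_getq_flow_t: each drains packets from one flow queue onto
 * the caller's head/tail chain until the flow's DRR deficit is used up, the
 * caller's packet/byte limits are reached, or the flow queue goes empty,
 * updating the class dequeue statistics along the way.
 */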
#if SKYWALK
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint32_t pflags, uint64_t now)
{
	uint32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	/*
	 * Assert to make sure pflags is part of PKT_F_COMMON_MASK;
	 * all common flags need to be declared in that mask.
	 */
	ASSERT((pflags & ~PKT_F_COMMON_MASK) == 0);

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !KPKTQ_EMPTY(&fq->fq_kpktq)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_PACKET);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		pkt.pktsched_pkt_kpkt->pkt_pflags |= pflags;

		if (head->cp_kpkt == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_kpkt != NULL);
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_kpkt->pkt_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
	return limit_reached;
}
#endif /* SKYWALK */

static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint32_t pflags, uint64_t now)
{
	u_int32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !MBUFQ_EMPTY(&fq->fq_mbufq)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_MBUF);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= pflags;

		if (head->cp_mbuf == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_mbuf != NULL);
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_mbuf->m_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
	return limit_reached;
}

fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
	fq_if_t *fqs;

	fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
	fqs->fqs_ifq = ifq;
	fqs->fqs_ptype = ptype;

	/* Configure packet drop limit across all queues */
	fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
	STAILQ_INIT(&fqs->fqs_fclist);
	TAILQ_INIT(&fqs->fqs_empty_list);
	TAILQ_INIT(&fqs->fqs_combined_grp_list);

	return fqs;
}

void
fq_if_destroy(fq_if_t *fqs)
{
	fq_if_purge(fqs);
	fq_if_destroy_grps(fqs);

	fqs->fqs_ifq = NULL;
	zfree(fq_if_zone, fqs);
}

static inline uint8_t
fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
{
	uint8_t pri;

	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		switch (svc) {
		case MBUF_SC_BK_SYS:
		case MBUF_SC_BK:
			pri = FQ_IF_BK_INDEX;
			break;
		case MBUF_SC_BE:
		case MBUF_SC_RD:
		case MBUF_SC_OAM:
			pri = FQ_IF_BE_INDEX;
			break;
		case MBUF_SC_AV:
		case MBUF_SC_RV:
		case MBUF_SC_VI:
		case MBUF_SC_SIG:
			pri = FQ_IF_VI_INDEX;
			break;
		case MBUF_SC_VO:
		case MBUF_SC_CTL:
			pri = FQ_IF_VO_INDEX;
			break;
		default:
			pri = FQ_IF_BE_INDEX; /* Use best effort by default */
			break;
		}
		return pri;
	}

	/* scheduler is not managed by the driver */
	switch (svc) {
	case MBUF_SC_BK_SYS:
		pri = FQ_IF_BK_SYS_INDEX;
		break;
	case MBUF_SC_BK:
		pri = FQ_IF_BK_INDEX;
		break;
	case MBUF_SC_BE:
		pri = FQ_IF_BE_INDEX;
		break;
	case MBUF_SC_RD:
		pri = FQ_IF_RD_INDEX;
		break;
	case MBUF_SC_OAM:
		pri = FQ_IF_OAM_INDEX;
		break;
	case MBUF_SC_AV:
		pri = FQ_IF_AV_INDEX;
		break;
	case MBUF_SC_RV:
		pri = FQ_IF_RV_INDEX;
		break;
	case MBUF_SC_VI:
		pri = FQ_IF_VI_INDEX;
		break;
	case MBUF_SC_SIG:
		pri = FQ_IF_SIG_INDEX;
		break;
	case MBUF_SC_VO:
		pri = FQ_IF_VO_INDEX;
		break;
	case MBUF_SC_CTL:
		pri = FQ_IF_CTL_INDEX;
		break;
	default:
		pri = FQ_IF_BE_INDEX; /* Use best effort by default */
		break;
	}
	return pri;
}

void
fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
    uint32_t drr_max, uint32_t svc_class)
{
	fq_if_classq_t *fq_cl;
	VERIFY(pri < FQ_IF_MAX_CLASSES);
	fq_cl = &fqg->fqg_classq[pri];

	VERIFY(fq_cl->fcl_quantum == 0);
	VERIFY(quantum != 0);
	fq_cl->fcl_quantum = quantum;
	fq_cl->fcl_pri = pri;
	fq_cl->fcl_drr_max = drr_max;
	fq_cl->fcl_service_class = svc_class;
	STAILQ_INIT(&fq_cl->fcl_new_flows);
	STAILQ_INIT(&fq_cl->fcl_old_flows);
}

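/*
 * Enqueue a packet chain on the class queue matching its service class
 * (and, with Skywalk, the group selected by the packet's qset index).
 * BK_SYS traffic is dropped outright while the scheduler is throttled;
 * otherwise the chain is handed to fq_addq() and the class is marked
 * in-bitmap (IB) so the dequeue side knows it has work. classq-layer
 * return codes are translated to errnos for the caller, e.g.
 * CLASSQEQ_DROP becomes ENOBUFS and CLASSQEQ_DROP_FC becomes EQFULL
 * with *pdrop set.
 */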
int
fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
	uint8_t pri, grp_idx = 0;
	fq_if_t *fqs;
	fq_if_classq_t *fq_cl;
	fq_if_group_t *fq_group;
	int ret;
	mbuf_svc_class_t svc;
	pktsched_pkt_t pkt;

	pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
	if (head->cp_ptype == QP_PACKET) {
		grp_idx = head->cp_kpkt->pkt_qset_idx;
	}
#endif /* SKYWALK */
	pri = fq_if_service_to_priority(fqs, svc);
	VERIFY(pri < FQ_IF_MAX_CLASSES);

	IFCQ_LOCK_SPIN(ifq);
	fq_group = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_group->fqg_classq[pri];

	if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
		IFCQ_UNLOCK(ifq);
		/* BK_SYS is currently throttled */
		atomic_add_32(&fq_cl->fcl_stat.fcl_throttle_drops, 1);
		pktsched_free_pkt(&pkt);
		*pdrop = TRUE;
		ret = EQSUSPENDED;
		goto done;
	}

	ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
	ret = fq_addq(fqs, fq_group, &pkt, fq_cl);
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
		    (1 << pri)) == 0) {
			/*
			 * this class is not in the ER or EB state,
			 * mark it as IB
			 */
			pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
		}
	}

	if (__improbable(ret != 0)) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_COMPRESSED) {
			ret = 0;
			*pdrop = FALSE;
		} else {
			IFCQ_UNLOCK(ifq);
			*pdrop = TRUE;
			pktsched_free_pkt(&pkt);
			switch (ret) {
			case CLASSQEQ_DROP:
				ret = ENOBUFS;
				goto done;
			case CLASSQEQ_DROP_FC:
				ret = EQFULL;
				goto done;
			case CLASSQEQ_DROP_SP:
				ret = EQSUSPENDED;
				goto done;
			default:
				VERIFY(0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* NOTREACHED */
			__builtin_unreachable();
		}
	} else {
		*pdrop = FALSE;
	}
	IFCQ_ADD_LEN(ifq, cnt);
	IFCQ_INC_BYTES(ifq, bytes);

	FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
	FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

	IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
	if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
		ret = 0;
	}
#endif /* DEBUG || DEVELOPMENT */
	return ret;
}

void
fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx)
{
	(void) fq_if_dequeue_classq_multi(ifq, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}

void
fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
    classq_pkt_t *pkt, uint8_t grp_idx)
{
	(void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}

static inline void
fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
{
	ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
	ASSERT(!fq->fq_in_dqlist);
	STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
	fq->fq_in_dqlist = true;
}

static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	ASSERT(fq->fq_in_dqlist);
	if (fq->fq_dq_head.cp_mbuf == NULL) {
		goto done;
	}

	if (head->cp_mbuf == NULL) {
		*head = fq->fq_dq_head;
	} else {
		ASSERT(tail->cp_mbuf != NULL);

		switch (ptype) {
		case QP_MBUF:
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
			ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
			break;
#if SKYWALK
		case QP_PACKET:
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
			ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
			break;
#endif /* SKYWALK */
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}
	*tail = fq->fq_dq_tail;
done:
	STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;
}

static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	fq_t *fq, *tfq;

	STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
		fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
	}
}

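/*
 * The fq_if_grps_*_bitmap_* helpers below implement the two flavors of
 * bitmap_ops_t: the plain variants scan the whole per-group state
 * bitmaps to find the highest-priority eligible class across groups,
 * while the _sc variants (used when the scheduler is driver managed)
 * only consult the single priority bit passed in. Both ffs variants
 * return a 1-based priority to match pktsched_ffs(), with 0 meaning
 * no bit set.
 */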
static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
    fq_if_group_t **selected_grp)
{
	#pragma unused(pri)

	fq_if_group_t *grp;
	uint32_t highest_pri = FQ_IF_MAX_CLASSES;
	int ret_pri = 0;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
		/* bitmap is empty in this case */
		if (cur_pri == 0) {
			continue;
		}
		if (cur_pri <= highest_pri) {
			highest_pri = cur_pri;
			ret_pri = cur_pri;
			*selected_grp = grp;
		}
	}
	return ret_pri;
}

static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
	#pragma unused(pri)

	fq_if_group_t *grp;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		if (grp->fqg_bitmaps[state] != 0) {
			return FALSE;
		}
	}
	return TRUE;
}

static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
    fq_if_state src_state)
{
	#pragma unused(pri)

	fq_if_group_t *grp;
	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
	}
}

static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
	#pragma unused(pri)

	fq_if_group_t *grp;
	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		grp->fqg_bitmaps[state] = 0;
	}
}

static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
    fq_if_group_t **selected_grp)
{
	fq_if_group_t *grp;
	int ret_pri = 0;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
			/* +1 to match the semantics of pktsched_ffs */
			ret_pri = pri + 1;
			*selected_grp = grp;
			break;
		}
	}

	return ret_pri;
}

static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
	fq_if_group_t *grp;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
			return FALSE;
		}
	}
	return TRUE;
}

static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
    fq_if_state src_state)
{
	fq_if_group_t *grp;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
		    &grp->fqg_bitmaps[src_state]);
	}
}

static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
	fq_if_group_t *grp;

	TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
		pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
	}
}

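/*
 * Core DRR dequeue loop. Classes move through three scheduler states
 * tracked in the per-group bitmaps: ER (eligible and being served),
 * EB (eligible, waiting behind ER) and IB (has packets but exhausted
 * its budget). Each iteration picks the highest-priority ER class
 * (promoting from EB, or refilling EB from IB when both ER and EB are
 * empty), refreshes its byte budget with min(drr_max, flow count)
 * quanta, and dequeues until the caller's packet/byte limits are met.
 */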
static int
fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	uint32_t total_pktcnt = 0, total_bytecnt = 0;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_classq_t *fq_cl;
	fq_grp_tailq_t *grp_list, tmp_grp_list;
	fq_if_group_t *fq_grp = NULL;
	fq_if_t *fqs;
	uint64_t now;
	int pri = 0, svc_pri = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	STAILQ_INIT(&fq_dqlist_head);

	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	now = fq_codel_get_time();
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		svc_pri = fq_if_service_to_priority(fqs, svc);
	} else {
		VERIFY(svc == MBUF_SC_UNSPEC);
	}

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		grp_list = &fqs->fqs_combined_grp_list;
		VERIFY(!TAILQ_EMPTY(grp_list));
	} else {
		grp_list = &tmp_grp_list;
		fq_grp = fq_if_find_grp(fqs, grp_idx);
		TAILQ_INIT(grp_list);
		TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
	}

	for (;;) {
		uint32_t pktcnt = 0, bytecnt = 0;
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);

		if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
		    fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
			fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
			fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
			if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
				break;
			}
		}
		pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
		if (pri == 0) {
			/*
			 * There are no ER flows, move the highest
			 * priority one from EB if there are any in that
			 * category
			 */
			pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
			VERIFY(pri > 0);
			VERIFY(fq_grp != NULL);
			pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
			pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		}
		VERIFY(fq_grp != NULL);
		pri--; /* index starts at 0 */
		fq_cl = &fq_grp->fqg_classq[pri];

		if (fq_cl->fcl_budget <= 0) {
			/* Update the budget */
			fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
			    fq_cl->fcl_stat.fcl_flows_cnt) *
			    fq_cl->fcl_quantum);
			if (fq_cl->fcl_budget <= 0) {
				goto state_change;
			}
		}
		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, true, now);
		if (head.cp_mbuf != NULL) {
			ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
			append_pkt(&last, &tmp);
		}
		fq_cl->fcl_budget -= bytecnt;
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		/*
		 * If the class has exceeded the budget but still has data
		 * to send, move it to IB
		 */
state_change:
		VERIFY(fq_grp != NULL);
		if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
			if (fq_cl->fcl_budget <= 0) {
				pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
				pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			}
		} else {
			pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
			    fq_grp->fqg_bitmaps[FQ_IF_EB] |
			    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
			fq_cl->fcl_budget = 0;
		}
		if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
			break;
		}
	}

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
		VERIFY(TAILQ_EMPTY(grp_list));
	}

	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
	    fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}

int
fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
    u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt,
    u_int32_t *retbytecnt, uint8_t grp_idx)
{
	return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
	           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
}

int
fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt, maxbytecnt,
		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
	} else {
		/*
		 * Take a shortcut here since there is no need to run
		 * the scheduler when dequeueing a single service class.
		 */
		return fq_if_dequeue_sc_classq_multi_separate(ifq, svc, maxpktcnt, maxbytecnt,
		           first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
	}
}

static int
fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
	uint8_t pri;
	u_int32_t total_pktcnt = 0, total_bytecnt = 0;
	fq_if_classq_t *fq_cl;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_group_t *fq_grp;
	uint64_t now;

	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	STAILQ_INIT(&fq_dqlist_head);
	now = fq_codel_get_time();

	pri = fq_if_service_to_priority(fqs, svc);
	fq_grp = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_grp->fqg_classq[pri];

	/*
	 * Now we have the queue for a particular service class. We need
	 * to dequeue as many packets as needed, first from the new flows
	 * and then from the old flows.
	 */
	while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
	    fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		u_int32_t pktcnt = 0, bytecnt = 0;

		fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, false, now);
		if (head.cp_mbuf != NULL) {
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
		}
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;
	}

	/*
	 * Mark the classq as IB if it's not idle, so that we can start
	 * without re-initializing the bitmaps when it's switched to
	 * combined mode.
	 */
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
	} else {
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
		    fq_grp->fqg_bitmaps[FQ_IF_EB] |
		    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
	}

	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}

static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
	fq_if_classq_t *fq_cl;
	u_int32_t pkts, bytes;
	pktsched_pkt_t pkt;
	fq_if_group_t *grp;

	fq_cl = &FQ_CLASSQ(fq);
	grp = FQ_GROUP(fq);
	pkts = bytes = 0;
	_PKTSCHED_PKT_INIT(&pkt);
	for (;;) {
		fq_getq_flow(fqs, fq, &pkt, now);
		if (pkt.pktsched_pkt_mbuf == NULL) {
			VERIFY(pkt.pktsched_ptype == QP_INVALID);
			break;
		}
		pkts++;
		bytes += pktsched_get_pkt_len(&pkt);
		pktsched_free_pkt(&pkt);
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

	IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

	/* move through the flow queue states */
	VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
	if (fq->fq_flags & FQF_NEW_FLOW) {
		fq_if_empty_new_flow(fq, fq_cl);
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		fq_if_empty_old_flow(fqs, fq_cl, fq, now);
	}
	if (fq->fq_flags & FQF_EMPTY_FLOW) {
		fq_if_purge_empty_flow(fqs, fq);
		fq = NULL;
	}

	if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
		int i;
		for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
			pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
		}
	}

	if (pktsp != NULL) {
		*pktsp = pkts;
	}
	if (bytesp != NULL) {
		*bytesp = bytes;
	}
}

static void
fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	fq_t *fq, *tfq;
	uint64_t now;

	now = fq_codel_get_time();
	/*
	 * Take each flow from new/old flow list and flush mbufs
	 * in that flow
	 */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
	}
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
	}
	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));

	STAILQ_INIT(&fq_cl->fcl_new_flows);
	STAILQ_INIT(&fq_cl->fcl_old_flows);
	fq_cl->fcl_budget = 0;
}

static void
fq_if_purge(fq_if_t *fqs)
{
	uint64_t now;
	fq_if_group_t *grp;
	int i;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);
		fq_if_purge_grp(fqs, grp);
	}

	now = fq_codel_get_time();
	fq_if_purge_empty_flow_list(fqs, now, true);

	VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
	VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));

	fqs->fqs_large_flow = NULL;
	for (i = 0; i < FQ_IF_HASH_TABLE_SIZE; i++) {
		VERIFY(SLIST_EMPTY(&fqs->fqs_flows[i]));
	}

	IFCQ_LEN(fqs->fqs_ifq) = 0;
	IFCQ_BYTES(fqs->fqs_ifq) = 0;
}

static void
fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
{
	fq_t *fq;
	uint64_t now;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	req->packets = req->bytes = 0;
	VERIFY(req->flow != 0);

	now = fq_codel_get_time();

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		uint32_t bytes = 0, pkts = 0;

		grp = fq_if_find_grp(fqs, grp_idx);
		/*
		 * Packet and traffic type are needed only if we want
		 * to create a flow queue.
		 */
		fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C);
		if (fq != NULL) {
			fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
			req->bytes += bytes;
			req->packets += pkts;
		}
	}
}

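/*
 * Compute the base DRR quantum for an interface: roughly one full-sized
 * frame (MTU, plus the Ethernet header where applicable), bumped up to
 * the largest TSO segment size when TSO is enabled, and never below
 * FQ_CODEL_DEFAULT_QUANTUM. The per-class macros at the top of this
 * file then scale this base value.
 */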
static uint16_t
fq_if_calc_quantum(struct ifnet *ifp)
{
	uint16_t quantum;

	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		VERIFY((ifp->if_mtu + ETHER_HDR_LEN) <= UINT16_MAX);
		quantum = (uint16_t)ifp->if_mtu + ETHER_HDR_LEN;
		break;

	case IFNET_FAMILY_CELLULAR:
	case IFNET_FAMILY_IPSEC:
	case IFNET_FAMILY_UTUN:
		VERIFY(ifp->if_mtu <= UINT16_MAX);
		quantum = (uint16_t)ifp->if_mtu;
		break;

	default:
		quantum = FQ_CODEL_DEFAULT_QUANTUM;
		break;
	}

	if ((ifp->if_hwassist & IFNET_TSOF) != 0) {
		VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
		VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
		quantum = (uint16_t)MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
		quantum = (quantum != 0) ? quantum : IF_MAXMTU;
	}

	quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
#if DEBUG || DEVELOPMENT
	quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
#endif /* DEBUG || DEVELOPMENT */
	VERIFY(quantum != 0);
	return quantum;
}

static void
fq_if_mtu_update(fq_if_t *fqs)
{
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q)                     \
	(_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum =        \
	    FQ_CODEL_QUANTUM_ ## _s(_q)                                 \

	uint32_t quantum;
	fq_if_group_t *grp;

	quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);

		if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
		} else {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
		}
	}
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}

static void
fq_if_event(fq_if_t *fqs, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);

	switch (ev) {
	case CLASSQ_EV_LINK_UP:
	case CLASSQ_EV_LINK_DOWN:
		fq_if_purge(fqs);
		break;
	case CLASSQ_EV_LINK_MTU:
		fq_if_mtu_update(fqs);
		break;
	default:
		break;
	}
}

static void
fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	fq_if_purge_classq(fqs, fq_cl);
	fqs->fqs_throttle = 1;
	fq_cl->fcl_stat.fcl_throttle_on++;
	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
}

static void
fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
	fqs->fqs_throttle = 0;
	fq_cl->fcl_stat.fcl_throttle_off++;
	KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
	    fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
}

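/*
 * Get or set the throttling level. Only BK_SYS is ever throttled:
 * entering IFNET_THROTTLE_OPPORTUNISTIC purges and suspends the BK_SYS
 * class queue in every group, and IFNET_THROTTLE_OFF resumes it.
 */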
static int
fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
{
	struct ifclassq *ifq = fqs->fqs_ifq;
	uint8_t index;
	fq_if_group_t *grp;

#if !MACH_ASSERT
#pragma unused(ifq)
#endif
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!tr->set) {
		tr->level = fqs->fqs_throttle;
		return 0;
	}

	if (tr->level == fqs->fqs_throttle) {
		return EALREADY;
	}

	/* Throttling is allowed on BK_SYS class only */
	index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		grp = fq_if_find_grp(fqs, grp_idx);
		switch (tr->level) {
		case IFNET_THROTTLE_OFF:
			fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
			break;
		case IFNET_THROTTLE_OPPORTUNISTIC:
			fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
			break;
		default:
			break;
		}
	}
	return 0;
}

static void
fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat)
{
	uint8_t pri;
	fq_if_classq_t *fq_cl;

	if (stat == NULL) {
		return;
	}

	pri = fq_if_service_to_priority(fqs, stat->sc);

	fq_cl = &grp->fqg_classq[pri];
	stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
	stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;
}

void
fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
{
	cqrq_stat_sc_t grp_sc_stat;
	fq_if_group_t *grp;

	if (stat == NULL) {
		return;
	}
	grp_sc_stat.sc = stat->sc;

	if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
		if (stat->sc == MBUF_SC_UNSPEC) {
			stat->packets = IFCQ_LEN(fqs->fqs_ifq);
			stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
		} else {
			for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				grp = fqs->fqs_classq_groups[grp_idx];
				if (grp == NULL) {
					continue;
				}

				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		}
		return;
	}

	if (stat->sc == MBUF_SC_UNSPEC) {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				stat->packets += FQG_LEN(grp);
				stat->bytes += FQG_BYTES(grp);
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			stat->packets = FQG_LEN(grp);
			stat->bytes = FQG_BYTES(grp);
		}
	} else {
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			fq_if_grp_stat_sc(fqs, grp, stat);
		}
	}
}

int
fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg)
{
	int err = 0;
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	/*
	 * These are usually slow operations, convert the lock ahead of time
	 */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	switch (rq) {
	case CLASSQRQ_PURGE:
		fq_if_purge(fqs);
		break;
	case CLASSQRQ_PURGE_SC:
		fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
		break;
	case CLASSQRQ_EVENT:
		fq_if_event(fqs, (cqev_t)arg);
		break;
	case CLASSQRQ_THROTTLE:
		fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
		break;
	case CLASSQRQ_STAT_SC:
		fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
		break;
	}
	return err;
}

int
fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	fq_if_t *fqs = NULL;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

	fqs = fq_if_alloc(ifq, ptype);
	if (fqs == NULL) {
		return ENOMEM;
	}
	if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
		fqs->fqs_flags |= FQS_DRIVER_MANAGED;
		fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
	} else {
		fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
	}

	err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
		    "failed to attach fq_if: %d\n", __func__, err);
		fq_if_destroy(fqs);
		return err;
	}

	/*
	 * Always create one group. If qset 0 is added later,
	 * this group will be updated.
	 */
	err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
	if (err != 0) {
		os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
		    "failed to create a fq group: %d\n", __func__, err);
		fq_if_destroy(fqs);
	}
	return err;
}

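/*
 * Look up the flow queue for (flowid, service class, traffic class type,
 * group) in the flow hash table. If it's missing and "create" is set, a
 * new flow queue is allocated and inserted; if it's found parked on the
 * empty-flow list, it is revived via fq_if_reuse_empty_flow().
 */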
fq_t *
fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, u_int32_t flowid,
    mbuf_svc_class_t svc_class, u_int64_t now, bool create,
    fq_tfc_type_t tfc_type)
{
	fq_t *fq = NULL;
	flowq_list_t *fq_list;
	fq_if_classq_t *fq_cl;
	u_int8_t fqs_hash_id;
	u_int8_t scidx;

	scidx = fq_if_service_to_priority(fqs, svc_class);

	fqs_hash_id = FQ_IF_FLOW_HASH_ID(flowid);

	fq_list = &fqs->fqs_flows[fqs_hash_id];

	SLIST_FOREACH(fq, fq_list, fq_hashlink) {
		if (fq->fq_flowhash == flowid &&
		    fq->fq_sc_index == scidx &&
		    fq->fq_tfc_type == tfc_type &&
		    fq->fq_group == fq_grp) {
			break;
		}
	}
	if (fq == NULL && create) {
		/* If the flow is not already on the list, allocate it */
		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		fq = fq_alloc(fqs->fqs_ptype);
		if (fq != NULL) {
			fq->fq_flowhash = flowid;
			fq->fq_sc_index = scidx;
			fq->fq_group = fq_grp;
			fq->fq_tfc_type = tfc_type;
			fq_cl = &FQ_CLASSQ(fq);
			fq->fq_flags = FQF_FLOWCTL_CAPABLE;
			fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
			SLIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
			fq_cl->fcl_stat.fcl_flows_cnt++;
			/* trace only when allocation succeeded; fq must not be NULL here */
			KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
			    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
			    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
		}
	} else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
		fq_if_reuse_empty_flow(fqs, fq, now);
	}

	/*
	 * If getq time is not set because this is the first packet or after
	 * idle time, set it now so that we can detect a stall.
	 */
	if (fq != NULL && fq->fq_getqtime == 0) {
		fq->fq_getqtime = now;
	}

	return fq;
}

void
fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
{
	u_int8_t hash_id;

	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
	hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash);
	SLIST_REMOVE(&fqs->fqs_flows[hash_id], fq, flowq,
	    fq_hashlink);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
	    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
	fq_destroy(fq, fqs->fqs_ptype);
}

inline boolean_t
fq_if_at_drop_limit(fq_if_t *fqs)
{
	return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
	       TRUE : FALSE;
}

inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t *fqs)
{
	/*
	 * Whether we are above 90% of the queue limit. This is used to tell if we
	 * can stop flow controlling the largest flow.
	 */
	return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
}

static inline void
fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
{
	ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
	STAILQ_NEXT(fq, fq_actlink) = NULL;
	fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
	fq->fq_empty_purge_time = 0;
	fq->fq_getqtime = 0;
	fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
	fqs->fqs_empty_list_cnt--;
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
	fq_cl->fcl_stat.fcl_flows_cnt++;
}

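/*
 * Empty-flow cache: rather than freeing a flow queue the moment it
 * drains, fq_if_move_to_empty_flow() parks it on fqs_empty_list for
 * fq_empty_purge_delay nanoseconds so a returning flow can be revived
 * cheaply by fq_if_reuse_empty_flow(). Parked flows stop counting
 * toward the class budget; fq_if_purge_empty_flow_list() reaps the
 * ones whose purge time has passed.
 */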
inline void
fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
	fq->fq_empty_purge_time = now + fq_empty_purge_delay;
	TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
	fq->fq_flags |= FQF_EMPTY_FLOW;
	FQ_CLEAR_OVERWHELMING(fq);
	fqs->fqs_empty_list_cnt++;
	/*
	 * fcl_flows_cnt is used in budget determination for the class.
	 * empty flow shouldn't contribute to the budget.
	 */
	fq_cl->fcl_stat.fcl_flows_cnt--;
}

static void
fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
{
	fq_if_classq_t *fq_cl;
	fq_cl = &FQ_CLASSQ(fq);

	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
	fq->fq_flags &= ~FQF_EMPTY_FLOW;
	fqs->fqs_empty_list_cnt--;
	/* Remove from the hash list and free the flow queue */
	fq_if_destroy_flow(fqs, fq_cl, fq);
}

static void
fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
{
	fq_t *fq, *tmp;
	int i = 0;

	if (fqs->fqs_empty_list_cnt == 0) {
		ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
		return;
	}

	TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
		if (!purge_all && ((now < fq->fq_empty_purge_time) ||
		    (i++ == FQ_EMPTY_PURGE_MAX))) {
			break;
		}
		fq_if_purge_empty_flow(fqs, fq);
	}

	if (__improbable(purge_all)) {
		VERIFY(fqs->fqs_empty_list_cnt == 0);
		VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
	}
}

static void
fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * Remove the flow queue from the old flows list.
	 */
	STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
	fq->fq_flags &= ~FQF_OLD_FLOW;
	fq_cl->fcl_stat.fcl_oldflows_cnt--;
	VERIFY(fq->fq_bytes == 0);

	/* release any flow control */
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}

	/* move the flow queue to empty flows list */
	fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
}

static void
fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
{
	/* Move to the end of old queue list */
	STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
	    flowq, fq_actlink);
	fq->fq_flags &= ~FQF_NEW_FLOW;
	fq_cl->fcl_stat.fcl_newflows_cnt--;

	STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
	fq->fq_flags |= FQF_OLD_FLOW;
	fq_cl->fcl_stat.fcl_oldflows_cnt++;
}

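/*
 * Overflow drop policy: when the scheduler hits its packet drop limit
 * (see fq_if_at_drop_limit() above), shed a packet from the head of the
 * largest flow (fqs_large_flow, maintained by fq_if_is_flow_heavy()
 * below) rather than from the newly arriving flow, penalizing the
 * heaviest user of the queue.
 */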
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* the queue of the largest flow cannot be empty */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL);

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	pktsched_free_pkt(&pkt);
	fq_cl->fcl_stat.fcl_drop_overflow++;
}

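/*
 * Maintains fqs_large_flow, the drop candidate for overflow handling:
 * the pointer is cleared once its flow shrinks below
 * FQ_IF_LARGE_FLOW_BYTE_LIMIT, and it is updated whenever the given
 * flow exceeds that limit and is larger than the current candidate.
 */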
inline void
fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
{
	fq_t *prev_fq;

	if (fqs->fqs_large_flow != NULL &&
	    fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
		fqs->fqs_large_flow = NULL;
	}

	if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
		return;
	}

	prev_fq = fqs->fqs_large_flow;
	if (prev_fq == NULL) {
		if (!fq_empty(fq, fqs->fqs_ptype)) {
			fqs->fqs_large_flow = fq;
		}
		return;
	} else if (fq->fq_bytes > prev_fq->fq_bytes) {
		fqs->fqs_large_flow = fq;
	}
}

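/*
 * Puts a flow under flow control by adding a flow advisory entry for
 * it. Returns TRUE if the flow is already on the flow-control list or
 * the entry was added, FALSE if entry allocation failed. L4S flows are
 * never flow-controlled through this path.
 */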
boolean_t
fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
    fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	ASSERT(fq->fq_tfc_type != FQ_TFC_L4S);
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == fq->fq_flowhash) {
			/* Already on flowcontrol list */
			return TRUE;
		}
	}
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		/* XXX Add number of bytes in the queue */
		STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
		fq_cl->fcl_stat.fcl_flow_control++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_control,
		    fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
		    if_name(fqs->fqs_ifq->ifcq_ifp));
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	return (fce != NULL) ? TRUE : FALSE;
}

static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	STAILQ_NEXT(fce, fce_link) = NULL;
	flowadv_add_entry(fce);
}

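/*
 * Lifts flow control from a flow: if a matching advisory entry is
 * found, it is removed from the scheduler's list and handed to
 * flowadv_add_entry() so the blocked source can resume transmitting.
 */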
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	if (fq->fq_tfc_type == FQ_TFC_L4S) {
		return;
	}

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp));
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}

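/*
 * DRR dequeue for one class: new flows are serviced first, then old
 * flows. fq_getq_flow_*() charges dequeued bytes against fq_deficit;
 * a flow whose deficit goes non-positive is credited one quantum and
 * rotated to the tail of the old flows list, so a flow that dequeues
 * far past its quantum sits out rounds until repeated quantum credits
 * bring its deficit positive again.
 */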
void
fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;

	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_NEW_FLOW);

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty,
		    PKTF_NEW_FLOW, now);

		/*
		 * From RFC 8290:
		 * if that queue has a negative number of credits (i.e., it has already
		 * dequeued at least a quantum of bytes), it is given an additional
		 * quantum of credits, the queue is put onto _the end of_ the list of
		 * old queues, and the routine selects the next queue and starts again.
		 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}

		if (limit_reached) {
			goto done;
		}
	}

	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, 0, now);

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
}

void
fq_if_teardown_ifclassq(struct ifclassq *ifq)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL);
	fq_if_destroy(fqs);
	ifq->ifcq_disc = NULL;
	ifclassq_detach(ifq);
}

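/*
 * Snapshots a single flow queue into a fq_codel_flowstats record,
 * including the flow's state flags (new/old, delay-high, flow
 * controlled, and whether it is currently the scheduler's largest
 * flow).
 */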
static void
fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
    struct fq_codel_flowstats *flowstat)
{
	bzero(flowstat, sizeof(*flowstat));
	flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
	flowstat->fqst_bytes = fq->fq_bytes;
	flowstat->fqst_flowhash = fq->fq_flowhash;
	if (fq->fq_flags & FQF_NEW_FLOW) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
	}
	if (fq->fq_flags & FQF_DELAY_HIGH) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
	}
	if (fq->fq_flags & FQF_FLOWCTL_ON) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
	}
	if (fqs->fqs_large_flow == fq) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
	}
}

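/*
 * Exports class-level scheduler statistics for the (gid, qid) pair,
 * along with per-flow stats for up to FQ_IF_MAX_FLOWSTATS flows. New
 * flows stop consuming slots at the halfway mark once the remaining
 * space is too small for all old flows, so both lists are represented.
 */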
int
fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
    struct if_ifclassq_stats *ifqs)
{
	struct fq_codel_classstats *fcls;
	fq_if_classq_t *fq_cl;
	fq_if_t *fqs;
	fq_t *fq = NULL;
	fq_if_group_t *grp;
	u_int32_t i, flowstat_cnt;

	if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
		return EINVAL;
	}

	fqs = (fq_if_t *)ifq->ifcq_disc;
	if (fqs->fqs_classq_groups[gid] == NULL) {
		return ENXIO;
	}

	fcls = &ifqs->ifqs_fq_codel_stats;

	fq_cl = &FQS_CLASSQ(fqs, gid, qid);
	grp = fq_if_find_grp(fqs, gid);

	fcls->fcls_pri = fq_cl->fcl_pri;
	fcls->fcls_service_class = fq_cl->fcl_service_class;
	fcls->fcls_quantum = fq_cl->fcl_quantum;
	fcls->fcls_drr_max = fq_cl->fcl_drr_max;
	fcls->fcls_budget = fq_cl->fcl_budget;
	fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
	fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
	fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
	fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
	fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
	fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
	fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
	fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
	fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
	fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
	fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
	fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
	fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
	fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
	fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
	fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
	fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
	fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
	fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
	fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
	fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
	fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
	fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
	fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
	fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
	fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
	fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
	fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
	fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;

	/* Gather per flow stats */
	flowstat_cnt = min((fcls->fcls_newflows_cnt +
	    fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
	i = 0;
	STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
		if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
			break;
		}

		/* leave space for a few old flows */
		if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
		    i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
			break;
		}
		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
		i++;
	}
	STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
		if (i >= flowstat_cnt) {
			break;
		}
		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
		i++;
	}
	VERIFY(i <= flowstat_cnt);
	fcls->fcls_flowstats_cnt = i;
	return 0;
}

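/*
 * Creates a classq group (or updates group 0 if it already exists).
 * Each service class is initialized with a quantum derived from the
 * interface quantum, and per-group target queue delays and update
 * intervals are computed separately for classic and L4S traffic.
 */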
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
#define _FQ_CLASSQ_INIT(_grp, _s, _q)                      \
    fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX,         \
	FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s),     \
	MBUF_SC_ ## _s );

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		_CASSERT(SCIDX_SIG == SCIDX_VI);
		_CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}

fq_if_group_t *
fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
{
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	grp = fqs->fqs_classq_groups[grp_idx];
	VERIFY(grp != NULL);

	return grp;
}

static void
fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
{
	for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
		fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
	}

	bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
	grp->fqg_len = 0;
	grp->fqg_bytes = 0;
	fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
}

void
fq_if_destroy_grps(fq_if_t *fqs)
{
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);
		fq_if_purge_grp(fqs, grp);
		zfree(fq_if_grp_zone, grp);
		fqs->fqs_classq_groups[grp_idx] = NULL;
	}
}

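/*
 * Combined-group membership is tracked in two places that the helpers
 * below keep in sync: fqs_combined_grp_bitmap for cheap membership
 * tests and fqs_combined_grp_list for iteration over combined groups.
 */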
static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}

void
fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
{
	fq_if_t *fqs;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(ifcq);

	fqs = (fq_if_t *)ifcq->ifcq_disc;
	grp = fq_if_find_grp(fqs, grp_idx);

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		return;
	}

	/*
	 * We keep the current fq_deficit and fcl_budget when combining a group.
	 * That might disrupt the AQM but only for a moment.
	 */
	pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
	TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
}

void
fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
{
	fq_if_t *fqs;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(ifcq);

	fqs = (fq_if_t *)ifcq->ifcq_disc;
	grp = fq_if_find_grp(fqs, grp_idx);

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		return;
	}

	pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
	TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
}
2283