1 /*
2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <kern/zalloc.h>
32 #include <net/ethernet.h>
33 #include <net/if_var.h>
34 #include <net/if.h>
35 #include <net/droptap.h>
36 #include <net/classq/classq.h>
37 #include <net/classq/classq_fq_codel.h>
38 #include <net/pktsched/pktsched_fq_codel.h>
39 #include <os/log.h>
40 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
41 #include <mach/thread_act.h>
42 #include <kern/thread.h>
43 #include <kern/sched_prim.h>
44
/* Default DRR quantum in bytes (roughly one full-size Ethernet frame). */
#define FQ_CODEL_DEFAULT_QUANTUM 1500

/*
 * Per-service-class DRR quantum, expressed as a function of the base
 * quantum: higher classes (AV/RV/VI) get 2x the quantum, while the
 * latency-sensitive VO/CTL classes get a smaller (2/5) quantum.
 *
 * The parameter is fully parenthesized so that callers may safely pass
 * arbitrary expressions (e.g. FQ_CODEL_QUANTUM_AV(a + b)).
 */
#define FQ_CODEL_QUANTUM_BK_SYS(_q)     (_q)
#define FQ_CODEL_QUANTUM_BK(_q)         (_q)
#define FQ_CODEL_QUANTUM_BE(_q)         (_q)
#define FQ_CODEL_QUANTUM_RD(_q)         (_q)
#define FQ_CODEL_QUANTUM_OAM(_q)        (_q)
#define FQ_CODEL_QUANTUM_AV(_q)         ((_q) * 2)
#define FQ_CODEL_QUANTUM_RV(_q)         ((_q) * 2)
#define FQ_CODEL_QUANTUM_VI(_q)         ((_q) * 2)
#define FQ_CODEL_QUANTUM_VO(_q)         (((_q) * 2) / 5)
#define FQ_CODEL_QUANTUM_CTL(_q)        (((_q) * 2) / 5)
57
/* Typed zones for the scheduler instance and per-group state. */
static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);

SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, fq_enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ifclassq_enable_pacing, 0, "Enable pacing");

/* Delay (ns) before an empty flow queue is actually purged; tunable on dev/debug kernels. */
static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
#if (DEVELOPMENT || DEBUG)
SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
    CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
#endif /* DEVELOPMENT || DEBUG */

/* Pacing enabled by default; overridable via sysctl and boot-arg (see pktsched_fq_init). */
unsigned int ifclassq_enable_pacing = 1;

/* List of flows with packets pending hand-off to the caller during dequeue. */
typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;
76
77 static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
78 static void fq_if_destroy(fq_if_t *fqs);
79 static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
80 uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
81 static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t,
82 int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
83 uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*);
84 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
85 static void fq_if_purge(fq_if_t *);
86 static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
87 static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
88 uint64_t);
89 static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
90 static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
91 fq_t *fq, uint64_t now);
92 static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
93 static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
94 bool purge_all);
95 static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
96 static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq,
97 mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
98 classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
99 u_int32_t *retbytecnt, uint8_t grp_idx);
100 static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
101 cqrq_stat_sc_t *stat, uint64_t now);
102 static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
103 static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
104 static void fq_if_destroy_grps(fq_if_t *fqs);
105
/*
 * Per-class DRR weights, indexed by class priority: the number of flows
 * (at most) whose quanta are added to the class budget per round, so
 * higher-priority classes get larger budgets. Overridable at boot via
 * the "ifcq_drr_max" boot-arg (parsed in pktsched_fq_init()).
 */
uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
	[FQ_IF_CTL_INDEX] = 8,
	[FQ_IF_VO_INDEX] = 8,
	[FQ_IF_VI_INDEX] = 6,
	[FQ_IF_RV_INDEX] = 6,
	[FQ_IF_AV_INDEX] = 6,
	[FQ_IF_OAM_INDEX] = 4,
	[FQ_IF_RD_INDEX] = 4,
	[FQ_IF_BE_INDEX] = 4,
	[FQ_IF_BK_INDEX] = 2,
	[FQ_IF_BK_SYS_INDEX] = 2,
};

/* Look up the DRR weight for a service class by its symbolic suffix (e.g. BE, VO). */
#define FQ_CODEL_DRR_MAX(_s) fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]
120
121 static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
122 fq_if_state state);
123 static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
124 fq_if_state dst_state, fq_if_state src_state);
125 static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
126 fq_if_state state);
127 static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
128 fq_if_state state, fq_if_group_t **selected_grp);
129 static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
130 fq_if_state dst_state, fq_if_state src_state);
131
132 static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
133 fq_if_state state);
134 static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
135 fq_if_state dst_state, fq_if_state src_state);
136 static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
137 fq_if_state state);
138 static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
139 fq_if_state state, fq_if_group_t **selected_grp);
140 static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
141 fq_if_state dst_state, fq_if_state src_state);
142
/*
 * Bitmap operations used when all priorities of a group are scanned
 * (non driver-managed scheduler): each op works on the whole per-group
 * state bitmap.
 */
bitmap_ops_t fq_if_grps_bitmap_ops =
{
	.ffs = fq_if_grps_bitmap_ffs,
	.zeros = fq_if_grps_bitmap_zeros,
	.cpy = fq_if_grps_bitmap_cpy,
	.clr = fq_if_grps_bitmap_clr,
	.move = fq_if_grps_bitmap_move,
};

/*
 * Bitmap operations restricted to a single service-class priority bit
 * (driver-managed scheduler): each op only touches bit `pri`.
 */
bitmap_ops_t fq_if_grps_sc_bitmap_ops =
{
	.ffs = fq_if_grps_sc_bitmap_ffs,
	.zeros = fq_if_grps_sc_bitmap_zeros,
	.cpy = fq_if_grps_sc_bitmap_cpy,
	.clr = fq_if_grps_sc_bitmap_clr,
	.move = fq_if_grps_sc_bitmap_move,
};
160
/* Number of buckets in the per-scheduler flow hash table; set once in pktsched_fq_init(). */
static uint32_t fq_if_hash_table_size;

extern int serverperfmode; // Temporary to resolve build dependency
164
165 void
pktsched_fq_init(void)166 pktsched_fq_init(void)
167 {
168 PE_parse_boot_argn("ifclassq_enable_pacing", &ifclassq_enable_pacing,
169 sizeof(ifclassq_enable_pacing));
170
171 if (serverperfmode) {
172 fq_if_hash_table_size = (1 << 16);
173 } else {
174 fq_if_hash_table_size = (1 << 8);
175 }
176
177 // format looks like ifcq_drr_max=8,8,6
178 char buf[(FQ_IF_MAX_CLASSES) * 3];
179 size_t i, len, pri_index = 0;
180 uint32_t drr = 0;
181 if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
182 return;
183 }
184
185 len = strbuflen(buf, sizeof(buf));
186 for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
187 if (buf[i] != ',' && buf[i] != '\0') {
188 VERIFY(buf[i] >= '0' && buf[i] <= '9');
189 drr = drr * 10 + buf[i] - '0';
190 continue;
191 }
192 fq_codel_drr_max_values[pri_index] = drr;
193 pri_index += 1;
194 drr = 0;
195 }
196 }
197
198 static uint32_t
fq_if_flow_hash_id(uint32_t flowid)199 fq_if_flow_hash_id(uint32_t flowid)
200 {
201 return flowid & (fq_if_hash_table_size - 1);
202 }
203
/* True when a class has no flows on either its new- or old-flow list. */
#define FQ_IF_CLASSQ_IDLE(_fcl_) \
    (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
    STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))

/* Link one dequeued packet after another; one implementation per packet type. */
typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
/* Per-packet-type flow dequeue routine; returns TRUE when the caller's limits were hit. */
typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
    int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
    uint32_t *, boolean_t *, uint64_t);
212
213 static void
fq_if_append_mbuf(classq_pkt_t * pkt,classq_pkt_t * next_pkt)214 fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
215 {
216 pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
217 }
218
219 static inline uint64_t
fq_codel_get_time(void)220 fq_codel_get_time(void)
221 {
222 struct timespec ts;
223 uint64_t now;
224
225 nanouptime(&ts);
226 now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
227 return now;
228 }
229
#if SKYWALK
/* Chain @next_pkt's kernel packet after @pkt's (Skywalk variant of fq_if_append_pkt_t). */
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
#endif /* SKYWALK */
237
#if SKYWALK
/*
 * Dequeue packets from a single flow queue (Skywalk kernel-packet
 * variant) while the flow's DRR deficit, the caller's packet/byte
 * limits, the queue itself, and the flow's pacing window all permit.
 * Dequeued packets are appended to the caller's @head/@tail chain and
 * the class dequeue statistics are updated.
 *
 * Returns TRUE when the caller's packet or byte limit was reached;
 * *qempty reports whether the flow queue is empty on return.
 */
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	uint32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	/*
	 * Assert to make sure pflags is part of PKT_F_COMMON_MASK;
	 * all common flags need to be declared in that mask.
	 */
	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_PACKET);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		/* The first packet of a fresh flow advertises PKT_F_NEW_FLOW exactly once. */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* Append to the caller's chain, maintaining head and tail. */
		if (head->cp_kpkt == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_kpkt != NULL);
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_kpkt->pkt_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
	return limit_reached;
}
#endif /* SKYWALK */
297
/*
 * Dequeue packets from a single flow queue (mbuf variant) while the
 * flow's DRR deficit, the caller's packet/byte limits, the queue
 * itself, and the flow's pacing window all permit. Dequeued packets
 * are appended to the caller's @head/@tail chain and the class
 * dequeue statistics are updated.
 *
 * Returns TRUE when the caller's packet or byte limit was reached;
 * *qempty reports whether the flow queue is empty on return.
 */
static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	u_int32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fq_getq_flow(fqs, fq, &pkt, now);
		ASSERT(pkt.pktsched_ptype == QP_MBUF);

		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;

		/* The first packet of a fresh flow advertises PKTF_NEW_FLOW exactly once. */
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* Append to the caller's chain, maintaining head and tail. */
		if (head->cp_mbuf == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_mbuf != NULL);
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_mbuf->m_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
	return limit_reached;
}
352
353 static void
fq_if_pacemaker_tcall(thread_call_param_t arg0,thread_call_param_t arg1)354 fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
355 {
356 #pragma unused(arg1)
357 struct ifnet* ifp = (struct ifnet*)arg0;
358 ASSERT(ifp != NULL);
359
360 ifnet_start_ignore_delay(ifp);
361 }
362
/*
 * Allocate and initialize the fq-codel scheduler state for @ifq.
 *
 * Allocates the scheduler structure, the flow hash table
 * (fq_if_hash_table_size buckets, sized at boot in pktsched_fq_init())
 * and the pacemaker thread call. Returns NULL on allocation failure;
 * the caller owns the returned scheduler and frees it via fq_if_destroy().
 */
fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
	flowq_list_t *fqs_flows;
	fq_if_t *fqs;

	ASSERT(ifq->ifcq_ifp != NULL);
	fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
	if (fqs == NULL) {
		return NULL;
	}
	fqs_flows = kalloc_type(flowq_list_t, fq_if_hash_table_size, Z_WAITOK | Z_ZERO);
	if (fqs_flows == NULL) {
		/* unwind the scheduler allocation on partial failure */
		zfree(fq_if_zone, fqs);
		return NULL;
	}
	fqs->fqs_flows = fqs_flows;
	fqs->fqs_flows_count = fq_if_hash_table_size;
	fqs->fqs_ifq = ifq;
	fqs->fqs_ptype = ptype;

	/* Configure packet drop limit across all queues */
	fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
	STAILQ_INIT(&fqs->fqs_fclist);
	TAILQ_INIT(&fqs->fqs_empty_list);
	TAILQ_INIT(&fqs->fqs_combined_grp_list);
	/* One-shot thread call used to doorbell the driver when pacing holds packets back. */
	fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall,
	    (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);

	return fqs;
}
396
/*
 * Tear down a scheduler instance: cancel and free the pacemaker thread
 * call, purge all queued packets and groups, then free the flow hash
 * table and the scheduler itself. Called with the IFCQ lock held;
 * must not be called with the interface start lock held (the thread
 * call cancel could otherwise deadlock against the start thread).
 */
void
fq_if_destroy(fq_if_t *fqs)
{
	struct ifnet *ifp = fqs->fqs_ifq->ifcq_ifp;
	thread_call_t __single tcall = fqs->fqs_pacemaker_tcall;

	VERIFY(ifp != NULL);
	ASSERT(tcall != NULL);
	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

	/*
	 * Since we are holding the IFCQ lock here, another thread cannot enter AQM
	 * and schedule a pacemaker call. So we do not need a sleep wait loop here
	 * cancel wait and free should succeed in one call.
	 */
	thread_call_cancel_wait(tcall);
	ASSERT(thread_call_free(tcall));

	fq_if_purge(fqs);
	fq_if_destroy_grps(fqs);

	fqs->fqs_ifq = NULL;

	kfree_type_counted_by(flowq_list_t, fqs->fqs_flows_count, fqs->fqs_flows);
	zfree(fq_if_zone, fqs);
}
425
426 static inline uint8_t
fq_if_service_to_priority(fq_if_t * fqs,mbuf_svc_class_t svc)427 fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
428 {
429 uint8_t pri;
430
431 if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
432 switch (svc) {
433 case MBUF_SC_BK_SYS:
434 case MBUF_SC_BK:
435 pri = FQ_IF_BK_INDEX;
436 break;
437 case MBUF_SC_BE:
438 case MBUF_SC_RD:
439 case MBUF_SC_OAM:
440 pri = FQ_IF_BE_INDEX;
441 break;
442 case MBUF_SC_AV:
443 case MBUF_SC_RV:
444 case MBUF_SC_VI:
445 case MBUF_SC_SIG:
446 pri = FQ_IF_VI_INDEX;
447 break;
448 case MBUF_SC_VO:
449 case MBUF_SC_CTL:
450 pri = FQ_IF_VO_INDEX;
451 break;
452 default:
453 pri = FQ_IF_BE_INDEX; /* Use best effort by default */
454 break;
455 }
456 return pri;
457 }
458
459 /* scheduler is not managed by the driver */
460 switch (svc) {
461 case MBUF_SC_BK_SYS:
462 pri = FQ_IF_BK_SYS_INDEX;
463 break;
464 case MBUF_SC_BK:
465 pri = FQ_IF_BK_INDEX;
466 break;
467 case MBUF_SC_BE:
468 pri = FQ_IF_BE_INDEX;
469 break;
470 case MBUF_SC_RD:
471 pri = FQ_IF_RD_INDEX;
472 break;
473 case MBUF_SC_OAM:
474 pri = FQ_IF_OAM_INDEX;
475 break;
476 case MBUF_SC_AV:
477 pri = FQ_IF_AV_INDEX;
478 break;
479 case MBUF_SC_RV:
480 pri = FQ_IF_RV_INDEX;
481 break;
482 case MBUF_SC_VI:
483 pri = FQ_IF_VI_INDEX;
484 break;
485 case MBUF_SC_SIG:
486 pri = FQ_IF_SIG_INDEX;
487 break;
488 case MBUF_SC_VO:
489 pri = FQ_IF_VO_INDEX;
490 break;
491 case MBUF_SC_CTL:
492 pri = FQ_IF_CTL_INDEX;
493 break;
494 default:
495 pri = FQ_IF_BE_INDEX; /* Use best effort by default */
496 break;
497 }
498 return pri;
499 }
500
501 void
fq_if_classq_init(fq_if_group_t * fqg,uint32_t pri,uint32_t quantum,uint32_t drr_max,uint32_t svc_class)502 fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
503 uint32_t drr_max, uint32_t svc_class)
504 {
505 fq_if_classq_t *fq_cl;
506 VERIFY(pri < FQ_IF_MAX_CLASSES);
507 fq_cl = &fqg->fqg_classq[pri];
508
509 VERIFY(fq_cl->fcl_quantum == 0);
510 VERIFY(quantum != 0);
511 fq_cl->fcl_quantum = quantum;
512 fq_cl->fcl_pri = pri;
513 fq_cl->fcl_drr_max = drr_max;
514 fq_cl->fcl_service_class = svc_class;
515 fq_cl->fcl_next_tx_time = 0;
516 fq_cl->fcl_flags = 0;
517 STAILQ_INIT(&fq_cl->fcl_new_flows);
518 STAILQ_INIT(&fq_cl->fcl_old_flows);
519 }
520
/*
 * Enqueue a chain of packets (@head..@tail, @cnt packets, @bytes bytes)
 * into the scheduler attached to @ifq.
 *
 * The target class is derived from the packet's service class; for
 * Skywalk packets, the queue-set index selects the group. *pdrop is set
 * to TRUE when the chain was dropped. Returns 0 on success, EQFULL for
 * advisory flow-control feedback, EQSUSPENDED when the class is
 * throttled/suspended, or ENOBUFS when the queue is full.
 */
int
fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
	uint8_t pri, grp_idx = 0;
	fq_if_t *fqs;
	fq_if_classq_t *fq_cl;
	fq_if_group_t *fq_group;
	int ret;
	mbuf_svc_class_t svc;
	pktsched_pkt_t pkt;

	pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
	if (head->cp_ptype == QP_PACKET) {
		grp_idx = head->cp_kpkt->pkt_qset_idx;
	}
#endif /* SKYWALK */
	pri = fq_if_service_to_priority(fqs, svc);
	VERIFY(pri < FQ_IF_MAX_CLASSES);

	IFCQ_LOCK_SPIN(ifq);
	fq_group = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_group->fqg_classq[pri];

	if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
		IFCQ_UNLOCK(ifq);
		/* BK_SYS is currently throttled */
		os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed);
		if (__improbable(droptap_verbose > 0)) {
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
		*pdrop = TRUE;
		ret = EQSUSPENDED;
		goto done;
	}

	ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
	ret = fq_addq(fqs, fq_group, &pkt, fq_cl);
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
		    (1 << pri)) == 0) {
			/*
			 * this group is not in ER or EB groups,
			 * mark it as IB
			 */
			pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
		}
	}

	if (__improbable(ret != 0)) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_COMPRESSED) {
			/* packet was compressed into an existing one; treat as success */
			ret = 0;
			*pdrop = FALSE;
		} else {
			IFCQ_UNLOCK(ifq);
			*pdrop = TRUE;
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_FULL,
			    __func__, __LINE__, 0);
			/* map the classq drop code to an errno-style result */
			switch (ret) {
			case CLASSQEQ_DROP:
				ret = ENOBUFS;
				goto done;
			case CLASSQEQ_DROP_FC:
				ret = EQFULL;
				goto done;
			case CLASSQEQ_DROP_SP:
				ret = EQSUSPENDED;
				goto done;
			default:
				VERIFY(0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* NOTREACHED */
			__builtin_unreachable();
		}
	} else {
		*pdrop = FALSE;
	}
	IFCQ_ADD_LEN(ifq, cnt);
	IFCQ_INC_BYTES(ifq, bytes);


	FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
	FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

	IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
	if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
		ret = 0;
	}
#endif /* DEBUG || DEVELOPMENT */
	return ret;
}
627
/*
 * Dequeue a single packet from group @grp_idx: thin wrapper around
 * fq_if_dequeue_classq_multi() with a packet limit of 1 and no byte limit.
 */
void
fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx)
{
	(void) fq_if_dequeue_classq_multi(ifq, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}
634
/*
 * Dequeue a single packet of service class @svc from group @grp_idx:
 * thin wrapper around fq_if_dequeue_sc_classq_multi() with a packet
 * limit of 1 and no byte limit.
 */
void
fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
    classq_pkt_t *pkt, uint8_t grp_idx)
{
	(void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}
642
643 static inline void
fq_dqlist_add(flowq_dqlist_t * fq_dqlist_head,fq_t * fq)644 fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
645 {
646 ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
647 ASSERT(!fq->fq_in_dqlist);
648 STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
649 fq->fq_in_dqlist = true;
650 }
651
/*
 * Unlink @fq from the pending-dequeue list and splice its private
 * packet chain (fq_dq_head..fq_dq_tail) onto the caller's @head/@tail
 * chain. The flow's chain pointers are reset and fq_in_dqlist cleared.
 */
static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	ASSERT(fq->fq_in_dqlist);
	if (fq->fq_dq_head.cp_mbuf == NULL) {
		/* flow has no buffered packets; just unlink it */
		goto done;
	}

	if (head->cp_mbuf == NULL) {
		*head = fq->fq_dq_head;
	} else {
		ASSERT(tail->cp_mbuf != NULL);

		/* link the flow's chain after the caller's current tail */
		switch (ptype) {
		case QP_MBUF:
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
			ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
			break;
#if SKYWALK
		case QP_PACKET:
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
			ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
			break;
#endif /* SKYWALK */
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}
	*tail = fq->fq_dq_tail;
done:
	STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;
}
692
693 static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t * fq_dqlist_head,classq_pkt_t * head,classq_pkt_t * tail,classq_pkt_type_t ptype)694 fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
695 classq_pkt_t *tail, classq_pkt_type_t ptype)
696 {
697 fq_t *fq, *tfq;
698
699 STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
700 fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
701 }
702 }
703
704 static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)705 fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
706 fq_if_group_t **selected_grp)
707 {
708 #pragma unused(pri)
709
710 fq_if_group_t *grp;
711 uint32_t highest_pri = FQ_IF_MAX_CLASSES;
712 int ret_pri = 0;
713
714 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
715 uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
716 /* bitmap is empty in this case */
717 if (cur_pri == 0) {
718 continue;
719 }
720 if (cur_pri <= highest_pri) {
721 highest_pri = cur_pri;
722 ret_pri = cur_pri;
723 *selected_grp = grp;
724 }
725 }
726 return ret_pri;
727 }
728
729 static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)730 fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
731 {
732 #pragma unused(pri)
733
734 fq_if_group_t *grp;
735
736 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
737 if (grp->fqg_bitmaps[state] != 0) {
738 return FALSE;
739 }
740 }
741 return TRUE;
742 }
743
744 static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)745 fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
746 fq_if_state src_state)
747 {
748 #pragma unused(pri)
749
750 fq_if_group_t *grp;
751 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
752 grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
753 }
754 }
755
756 static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)757 fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
758 {
759 #pragma unused(pri)
760
761 fq_if_group_t *grp;
762 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
763 grp->fqg_bitmaps[state] = 0;
764 }
765 }
766
767 static void
fq_if_grps_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)768 fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
769 fq_if_state src_state)
770 {
771 #pragma unused(pri)
772
773 fq_if_group_t *grp;
774 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
775 grp->fqg_bitmaps[dst_state] =
776 grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state];
777 grp->fqg_bitmaps[src_state] = 0;
778 }
779 }
780
781 static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)782 fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
783 fq_if_group_t **selected_grp)
784 {
785 fq_if_group_t *grp;
786 int ret_pri = 0;
787
788 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
789 if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
790 /* +1 to match the semantics of pktsched_ffs */
791 ret_pri = pri + 1;
792 *selected_grp = grp;
793 break;
794 }
795 }
796
797 return ret_pri;
798 }
799
800 static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)801 fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
802 {
803 fq_if_group_t *grp;
804
805 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
806 if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
807 return FALSE;
808 }
809 }
810 return TRUE;
811 }
812
813 static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)814 fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
815 fq_if_state src_state)
816 {
817 fq_if_group_t *grp;
818
819 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
820 pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
821 &grp->fqg_bitmaps[src_state]);
822 }
823 }
824
825 static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)826 fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
827 {
828 fq_if_group_t *grp;
829
830 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
831 pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
832 }
833 }
834
835 static void
fq_if_grps_sc_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)836 fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
837 fq_if_state src_state)
838 {
839 fq_if_group_t *grp;
840
841 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
842 pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state],
843 &grp->fqg_bitmaps[src_state]);
844 pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]);
845 }
846 }
847
848 /*
849 * Pacemaker is only scheduled when no packet can be dequeued from AQM
850 * due to pacing. Pacemaker will doorbell the driver when current >= next_tx_time.
851 * This only applies to L4S traffic at this moment.
852 */
853 static void
fq_if_schedule_pacemaker(fq_if_t * fqs,uint64_t now,uint64_t next_tx_time)854 fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
855 {
856 uint64_t deadline = 0;
857 if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
858 return;
859 }
860 ASSERT(next_tx_time != FQ_INVALID_TX_TS);
861 ASSERT(fqs->fqs_pacemaker_tcall != NULL);
862 ASSERT(now < next_tx_time);
863
864 DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, fqs->fqs_ifq->ifcq_ifp,
865 uint64_t, next_tx_time - now);
866 KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index, now,
867 next_tx_time, next_tx_time - now);
868
869 clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1, &deadline);
870 thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
871 }
872
873 static int
fq_if_dequeue_classq_multi_common(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)874 fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
875 u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
876 classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
877 uint8_t grp_idx)
878 {
879 uint32_t total_pktcnt = 0, total_bytecnt = 0;
880 classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
881 classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
882 classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
883 fq_if_append_pkt_t append_pkt;
884 flowq_dqlist_t fq_dqlist_head;
885 fq_if_classq_t *fq_cl;
886 fq_grp_tailq_t *grp_list, tmp_grp_list;
887 fq_if_group_t *__single fq_grp = NULL;
888 fq_if_t *fqs;
889 uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
890 int pri = 0, svc_pri = 0;
891 bool all_paced = true;
892
893 IFCQ_LOCK_ASSERT_HELD(ifq);
894
895 fqs = (fq_if_t *)ifq->ifcq_disc;
896 STAILQ_INIT(&fq_dqlist_head);
897
898 switch (fqs->fqs_ptype) {
899 case QP_MBUF:
900 append_pkt = fq_if_append_mbuf;
901 break;
902
903 #if SKYWALK
904 case QP_PACKET:
905 append_pkt = fq_if_append_pkt;
906 break;
907 #endif /* SKYWALK */
908
909 default:
910 VERIFY(0);
911 /* NOTREACHED */
912 __builtin_unreachable();
913 }
914
915 now = fq_codel_get_time();
916 if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
917 svc_pri = fq_if_service_to_priority(fqs, svc);
918 } else {
919 VERIFY(svc == MBUF_SC_UNSPEC);
920 }
921
922 if (fq_if_is_grp_combined(fqs, grp_idx)) {
923 grp_list = &fqs->fqs_combined_grp_list;
924 VERIFY(!TAILQ_EMPTY(grp_list));
925 } else {
926 grp_list = &tmp_grp_list;
927 fq_grp = fq_if_find_grp(fqs, grp_idx);
928 TAILQ_INIT(grp_list);
929 TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
930 }
931
932 for (;;) {
933 uint32_t pktcnt = 0, bytecnt = 0;
934 classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
935 classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
936 bool fq_cl_all_paced = false;
937 uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;
938
939 if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
940 fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
941 fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
942 fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
943 if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
944 if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
945 /*
946 * Move fq_cl in IR back to ER, so that they will inspected with priority
947 * the next time the driver dequeues
948 */
949 fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
950 fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IR);
951 }
952 break;
953 }
954 }
955 pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
956 if (pri == 0) {
957 /*
958 * There are no ER flows, move the highest
959 * priority one from EB if there are any in that
960 * category
961 */
962 pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
963 VERIFY(pri > 0);
964 VERIFY(fq_grp != NULL);
965 pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
966 pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
967 }
968 VERIFY(fq_grp != NULL);
969 pri--; /* index starts at 0 */
970 fq_cl = &fq_grp->fqg_classq[pri];
971
972 if (fq_cl->fcl_budget <= 0) {
973 /* Update the budget */
974 fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
975 fq_cl->fcl_stat.fcl_flows_cnt) *
976 fq_cl->fcl_quantum);
977 if (fq_cl->fcl_budget <= 0) {
978 goto state_change;
979 }
980 }
981 fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
982 (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
983 &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
984 &fq_cl_next_tx_time);
985 if (head.cp_mbuf != NULL) {
986 ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
987 if (first.cp_mbuf == NULL) {
988 first = head;
989 } else {
990 ASSERT(last.cp_mbuf != NULL);
991 append_pkt(&last, &head);
992 }
993 last = tail;
994 append_pkt(&last, &tmp);
995 }
996 if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
997 fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
998 next_tx_time = fq_cl_next_tx_time;
999 }
1000 fq_cl->fcl_budget -= bytecnt;
1001 total_pktcnt += pktcnt;
1002 total_bytecnt += bytecnt;
1003
1004 /*
1005 * If the class has exceeded the budget but still has data
1006 * to send, move it to IB
1007 */
1008 state_change:
1009 VERIFY(fq_grp != NULL);
1010 all_paced &= fq_cl_all_paced;
1011 if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
1012 if (fq_cl->fcl_budget <= 0) {
1013 pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
1014 pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
1015 } else if (fq_cl_all_paced) {
1016 if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
1017 /*
1018 * If a fq_cl still has budget but only paced queues, park it
1019 * to IR so that we will not keep loopping over it
1020 */
1021 pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IR]);
1022 pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
1023 }
1024 }
1025 } else {
1026 pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
1027 VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
1028 fq_grp->fqg_bitmaps[FQ_IF_EB] |
1029 fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
1030 fq_cl->fcl_budget = 0;
1031 }
1032 if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
1033 if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
1034 /*
1035 * Move fq_cl in IR back to ER, so that they will inspected with priority
1036 * the next time the driver dequeues
1037 */
1038 fqs->grp_bitmaps_move(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
1039 }
1040 break;
1041 }
1042 }
1043
1044 if (!fq_if_is_grp_combined(fqs, grp_idx)) {
1045 TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
1046 VERIFY(TAILQ_EMPTY(grp_list));
1047 }
1048
1049 fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
1050 fqs->fqs_ptype);
1051
1052 if (__probable(first_packet != NULL)) {
1053 *first_packet = first;
1054 }
1055 if (last_packet != NULL) {
1056 *last_packet = last;
1057 }
1058 if (retpktcnt != NULL) {
1059 *retpktcnt = total_pktcnt;
1060 }
1061 if (retbytecnt != NULL) {
1062 *retbytecnt = total_bytecnt;
1063 }
1064 if (next_tx_time != FQ_INVALID_TX_TS) {
1065 ASSERT(next_tx_time > now);
1066 fq_if_schedule_pacemaker(fqs, now, next_tx_time);
1067 }
1068
1069 IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
1070 fq_if_purge_empty_flow_list(fqs, now, false);
1071 return 0;
1072 }
1073
1074 int
fq_if_dequeue_classq_multi(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1075 fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
1076 u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1077 classq_pkt_t *last_packet, u_int32_t *retpktcnt,
1078 u_int32_t *retbytecnt, uint8_t grp_idx)
1079 {
1080 return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
1081 first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1082 }
1083
1084 int
fq_if_dequeue_sc_classq_multi(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1085 fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
1086 u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1087 classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1088 uint8_t grp_idx)
1089 {
1090 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1091
1092 if (fq_if_is_grp_combined(fqs, grp_idx)) {
1093 return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt, maxbytecnt,
1094 first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1095 } else {
1096 /*
1097 * take a shortcut here since there is no need to schedule
1098 * one single service class.
1099 */
1100 return fq_if_dequeue_sc_classq_multi_separate(ifq, svc, maxpktcnt, maxbytecnt,
1101 first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1102 }
1103 }
1104
1105 static int
fq_if_dequeue_sc_classq_multi_separate(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1106 fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
1107 u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1108 classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1109 uint8_t grp_idx)
1110 {
1111 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1112 uint8_t pri;
1113 u_int32_t total_pktcnt = 0, total_bytecnt = 0;
1114 fq_if_classq_t *fq_cl;
1115 classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
1116 classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
1117 fq_if_append_pkt_t append_pkt;
1118 flowq_dqlist_t fq_dqlist_head;
1119 fq_if_group_t *fq_grp;
1120 uint64_t now;
1121
1122 switch (fqs->fqs_ptype) {
1123 case QP_MBUF:
1124 append_pkt = fq_if_append_mbuf;
1125 break;
1126
1127 #if SKYWALK
1128 case QP_PACKET:
1129 append_pkt = fq_if_append_pkt;
1130 break;
1131 #endif /* SKYWALK */
1132
1133 default:
1134 VERIFY(0);
1135 /* NOTREACHED */
1136 __builtin_unreachable();
1137 }
1138
1139 STAILQ_INIT(&fq_dqlist_head);
1140 now = fq_codel_get_time();
1141
1142 pri = fq_if_service_to_priority(fqs, svc);
1143 fq_grp = fq_if_find_grp(fqs, grp_idx);
1144 fq_cl = &fq_grp->fqg_classq[pri];
1145
1146 /*
1147 * Now we have the queue for a particular service class. We need
1148 * to dequeue as many packets as needed, first from the new flows
1149 * and then from the old flows.
1150 */
1151 while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
1152 fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
1153 classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
1154 classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
1155 u_int32_t pktcnt = 0, bytecnt = 0;
1156 bool all_paced = false;
1157 uint64_t next_tx_time = FQ_INVALID_TX_TS;
1158
1159 fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
1160 (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
1161 &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time);
1162 if (head.cp_mbuf != NULL) {
1163 if (first.cp_mbuf == NULL) {
1164 first = head;
1165 } else {
1166 ASSERT(last.cp_mbuf != NULL);
1167 append_pkt(&last, &head);
1168 }
1169 last = tail;
1170 }
1171 total_pktcnt += pktcnt;
1172 total_bytecnt += bytecnt;
1173
1174 if (next_tx_time != FQ_INVALID_TX_TS) {
1175 ASSERT(next_tx_time > now);
1176 fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
1177 fq_if_schedule_pacemaker(fqs, now, next_tx_time);
1178 break;
1179 }
1180 }
1181
1182 /*
1183 * Mark classq as IB if it's not idle, so that we can
1184 * start without re-init the bitmaps when it's switched
1185 * to combined mode.
1186 */
1187 if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
1188 pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
1189 pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
1190 pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
1191 } else {
1192 pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
1193 VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
1194 fq_grp->fqg_bitmaps[FQ_IF_EB] |
1195 fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
1196 }
1197
1198 fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);
1199
1200 if (__probable(first_packet != NULL)) {
1201 *first_packet = first;
1202 }
1203 if (last_packet != NULL) {
1204 *last_packet = last;
1205 }
1206 if (retpktcnt != NULL) {
1207 *retpktcnt = total_pktcnt;
1208 }
1209 if (retbytecnt != NULL) {
1210 *retbytecnt = total_bytecnt;
1211 }
1212
1213 IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
1214 fq_if_purge_empty_flow_list(fqs, now, false);
1215 return 0;
1216 }
1217
/*
 * Drain and drop every packet queued on one flow, credit the drops to
 * the interface queue, and walk the flow through its list states
 * (new -> old -> empty -> destroyed).  Amounts purged are reported via
 * pktsp/bytesp when those are non-NULL.
 */
static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
	fq_if_classq_t *fq_cl;
	u_int32_t pkts, bytes;
	pktsched_pkt_t pkt;
	fq_if_group_t *grp;

	fq_cl = &FQ_CLASSQ(fq);
	grp = FQ_GROUP(fq);
	pkts = bytes = 0;
	_PKTSCHED_PKT_INIT(&pkt);
	/* Dequeue until the flow is empty, counting what we drop. */
	for (;;) {
		fq_getq_flow(fqs, fq, &pkt, now);
		if (pkt.pktsched_pkt_mbuf == NULL) {
			VERIFY(pkt.pktsched_ptype == QP_INVALID);
			break;
		}
		pkts++;
		bytes += pktsched_get_pkt_len(&pkt);
		/* With verbose droptap, report each drop; otherwise just free. */
		if (__improbable(droptap_verbose > 0)) {
			pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_PURGE_FLOW,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

	IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

	/* move through the flow queue states */
	VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
	if (fq->fq_flags & FQF_NEW_FLOW) {
		fq_if_empty_new_flow(fq, fq_cl);
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		fq_if_empty_old_flow(fqs, fq_cl, fq, now);
	}
	if (fq->fq_flags & FQF_EMPTY_FLOW) {
		/* The flow is freed in here; fq must not be touched after. */
		fq_if_purge_empty_flow(fqs, fq);
		fq = NULL;
	}

	/* If the class went fully idle, clear it from all state bitmaps. */
	if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
		int i;
		for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
			pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
		}
	}

	if (pktsp != NULL) {
		*pktsp = pkts;
	}
	if (bytesp != NULL) {
		*bytesp = bytes;
	}
}
1278
1279 static void
fq_if_purge_classq(fq_if_t * fqs,fq_if_classq_t * fq_cl)1280 fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1281 {
1282 fq_t *fq, *tfq;
1283 uint64_t now;
1284
1285 now = fq_codel_get_time();
1286 /*
1287 * Take each flow from new/old flow list and flush mbufs
1288 * in that flow
1289 */
1290 STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
1291 fq_if_purge_flow(fqs, fq, NULL, NULL, now);
1292 }
1293 STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
1294 fq_if_purge_flow(fqs, fq, NULL, NULL, now);
1295 }
1296 VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
1297 VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));
1298
1299 STAILQ_INIT(&fq_cl->fcl_new_flows);
1300 STAILQ_INIT(&fq_cl->fcl_old_flows);
1301 fq_cl->fcl_budget = 0;
1302 }
1303
1304 static void
fq_if_purge(fq_if_t * fqs)1305 fq_if_purge(fq_if_t *fqs)
1306 {
1307 uint64_t now;
1308 fq_if_group_t *grp;
1309 int i;
1310
1311 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1312 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1313 if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1314 continue;
1315 }
1316
1317 grp = fq_if_find_grp(fqs, grp_idx);
1318 fq_if_purge_grp(fqs, grp);
1319 }
1320
1321 now = fq_codel_get_time();
1322 fq_if_purge_empty_flow_list(fqs, now, true);
1323
1324 VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
1325 VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1326
1327 fqs->fqs_large_flow = NULL;
1328 for (i = 0; i < fqs->fqs_flows_count; i++) {
1329 VERIFY(LIST_EMPTY(&fqs->fqs_flows[i]));
1330 }
1331
1332 IFCQ_LEN(fqs->fqs_ifq) = 0;
1333 IFCQ_BYTES(fqs->fqs_ifq) = 0;
1334 }
1335
1336 static void
fq_if_purge_sc(fq_if_t * fqs,cqrq_purge_sc_t * req)1337 fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
1338 {
1339 fq_t *fq;
1340 uint64_t now;
1341 fq_if_group_t *grp;
1342
1343 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1344 req->packets = req->bytes = 0;
1345 VERIFY(req->flow != 0);
1346
1347 now = fq_codel_get_time();
1348
1349 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1350 if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1351 continue;
1352 }
1353 uint32_t bytes = 0, pkts = 0;
1354
1355 grp = fq_if_find_grp(fqs, grp_idx);
1356 /*
1357 * Packet and traffic type are needed only if we want
1358 * to create a flow queue.
1359 */
1360 fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C);
1361 if (fq != NULL) {
1362 fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
1363 req->bytes += bytes;
1364 req->packets += pkts;
1365 }
1366 }
1367 }
1368
1369 static uint32_t
fq_if_calc_quantum(struct ifnet * ifp)1370 fq_if_calc_quantum(struct ifnet *ifp)
1371 {
1372 uint32_t quantum;
1373
1374 switch (ifp->if_family) {
1375 case IFNET_FAMILY_ETHERNET:
1376 VERIFY(ifp->if_mtu <= IF_MAXMTU);
1377 quantum = ifp->if_mtu + ETHER_HDR_LEN;
1378 break;
1379
1380 case IFNET_FAMILY_CELLULAR:
1381 case IFNET_FAMILY_IPSEC:
1382 case IFNET_FAMILY_UTUN:
1383 VERIFY(ifp->if_mtu <= UINT16_MAX);
1384 quantum = ifp->if_mtu;
1385 break;
1386
1387 default:
1388 quantum = FQ_CODEL_DEFAULT_QUANTUM;
1389 break;
1390 }
1391
1392 if ((ifp->if_hwassist & IFNET_TSOF) != 0) {
1393 VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
1394 VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
1395 quantum = MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
1396 quantum = (quantum != 0) ? quantum : IF_MAXMTU;
1397 }
1398
1399 quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
1400 #if DEBUG || DEVELOPMENT
1401 quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
1402 #endif /* DEBUG || DEVELOPMENT */
1403 VERIFY(quantum != 0);
1404 return quantum;
1405 }
1406
/*
 * Recompute the per-class DRR quantum for every allocated group after
 * an MTU change.  Driver-managed schedulers expose only four classes
 * (BK/BE/VI/VO); the full scheduler programs all ten.
 */
static void
fq_if_mtu_update(fq_if_t *fqs)
{
/*
 * Token-pasting helper: expands to an assignment of the class's
 * fcl_quantum using the matching FQ_IF_<class>_INDEX slot and the
 * FQ_CODEL_QUANTUM_<class>() scaling macro.
 */
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q)                    \
    (_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum =       \
    FQ_CODEL_QUANTUM_ ## _s(_q)                                    \

	uint32_t quantum;
	fq_if_group_t *grp;

	/* One base quantum derived from the interface; scaled per class. */
	quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);

		if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
		} else {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
		}
	}
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}
1446
1447 static void
fq_if_event(fq_if_t * fqs,cqev_t ev)1448 fq_if_event(fq_if_t *fqs, cqev_t ev)
1449 {
1450 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1451
1452 switch (ev) {
1453 case CLASSQ_EV_LINK_UP:
1454 case CLASSQ_EV_LINK_DOWN:
1455 fq_if_purge(fqs);
1456 break;
1457 case CLASSQ_EV_LINK_MTU:
1458 fq_if_mtu_update(fqs);
1459 break;
1460 default:
1461 break;
1462 }
1463 }
1464
1465 static void
fq_if_classq_suspend(fq_if_t * fqs,fq_if_classq_t * fq_cl)1466 fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1467 {
1468 fq_if_purge_classq(fqs, fq_cl);
1469 fqs->fqs_throttle = 1;
1470 fq_cl->fcl_stat.fcl_throttle_on++;
1471 KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
1472 fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1473 }
1474
1475 static void
fq_if_classq_resume(fq_if_t * fqs,fq_if_classq_t * fq_cl)1476 fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1477 {
1478 VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
1479 fqs->fqs_throttle = 0;
1480 fq_cl->fcl_stat.fcl_throttle_off++;
1481 KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
1482 fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1483 }
1484
1485
1486 static int
fq_if_throttle(fq_if_t * fqs,cqrq_throttle_t * tr)1487 fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
1488 {
1489 struct ifclassq *ifq = fqs->fqs_ifq;
1490 uint8_t index;
1491 fq_if_group_t *grp;
1492
1493 #if !MACH_ASSERT
1494 #pragma unused(ifq)
1495 #endif
1496 IFCQ_LOCK_ASSERT_HELD(ifq);
1497
1498 if (!tr->set) {
1499 tr->level = fqs->fqs_throttle;
1500 return 0;
1501 }
1502
1503 if (tr->level == fqs->fqs_throttle) {
1504 return EALREADY;
1505 }
1506
1507 /* Throttling is allowed on BK_SYS class only */
1508 index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);
1509
1510 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1511 if (fqs->fqs_classq_groups[grp_idx] == NULL) {
1512 continue;
1513 }
1514 grp = fq_if_find_grp(fqs, grp_idx);
1515 switch (tr->level) {
1516 case IFNET_THROTTLE_OFF:
1517 fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
1518 break;
1519 case IFNET_THROTTLE_OPPORTUNISTIC:
1520 fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
1521 break;
1522 default:
1523 break;
1524 }
1525 }
1526 return 0;
1527 }
1528
1529 static inline boolean_t
fq_if_is_fq_cl_paced(fq_if_classq_t * fq_cl,uint64_t now)1530 fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now)
1531 {
1532 if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) {
1533 return true;
1534 }
1535
1536 fq_cl->fcl_flags &= ~FCL_PACED;
1537 fq_cl->fcl_next_tx_time = 0;
1538 return false;
1539 }
1540
1541 static void
fq_if_grp_stat_sc(fq_if_t * fqs,fq_if_group_t * grp,cqrq_stat_sc_t * stat,uint64_t now)1542 fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now)
1543 {
1544 uint8_t pri;
1545 fq_if_classq_t *fq_cl;
1546
1547 ASSERT(stat != NULL);
1548 pri = fq_if_service_to_priority(fqs, stat->sc);
1549
1550 fq_cl = &grp->fqg_classq[pri];
1551 stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
1552 stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;
1553
1554 if (ifclassq_enable_pacing && ifclassq_enable_l4s &&
1555 fq_if_is_fq_cl_paced(fq_cl, now)) {
1556 stat->packets = 0;
1557 stat->bytes = 0;
1558 }
1559 }
1560
1561 static boolean_t
fq_if_is_grp_all_paced(fq_if_group_t * grp)1562 fq_if_is_grp_all_paced(fq_if_group_t *grp)
1563 {
1564 fq_if_classq_t *fq_cl;
1565 uint64_t now;
1566
1567 if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
1568 return false;
1569 }
1570
1571 now = fq_codel_get_time();
1572 for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) {
1573 fq_cl = &grp->fqg_classq[fq_cl_idx];
1574 if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) {
1575 continue;
1576 }
1577 if (!fq_if_is_fq_cl_paced(fq_cl, now)) {
1578 return false;
1579 }
1580 }
1581
1582 return true;
1583 }
1584
1585 boolean_t
fq_if_is_all_paced(struct ifclassq * ifq)1586 fq_if_is_all_paced(struct ifclassq *ifq)
1587 {
1588 fq_if_group_t *grp;
1589 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1590
1591 IFCQ_LOCK_ASSERT_HELD(ifq);
1592
1593 if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
1594 return false;
1595 }
1596
1597 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1598 grp = fqs->fqs_classq_groups[grp_idx];
1599 if (grp == NULL || FQG_BYTES(grp) == 0) {
1600 continue;
1601 }
1602
1603 if (!fq_if_is_grp_all_paced(grp)) {
1604 return false;
1605 }
1606 }
1607
1608 return true;
1609 }
1610
1611 void
fq_if_stat_sc(fq_if_t * fqs,cqrq_stat_sc_t * stat)1612 fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
1613 {
1614 cqrq_stat_sc_t grp_sc_stat;
1615 fq_if_group_t *grp;
1616 uint64_t now = fq_codel_get_time();
1617
1618 if (stat == NULL) {
1619 return;
1620 }
1621 grp_sc_stat.sc = stat->sc;
1622 stat->packets = 0;
1623 stat->bytes = 0;
1624
1625 if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
1626 if (stat->sc == MBUF_SC_UNSPEC) {
1627 if (!fq_if_is_all_paced(fqs->fqs_ifq)) {
1628 stat->packets = IFCQ_LEN(fqs->fqs_ifq);
1629 stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
1630 }
1631 } else {
1632 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1633 grp = fqs->fqs_classq_groups[grp_idx];
1634 if (grp == NULL) {
1635 continue;
1636 }
1637
1638 fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
1639 stat->packets += grp_sc_stat.packets;
1640 stat->bytes += grp_sc_stat.bytes;
1641 }
1642 }
1643 return;
1644 }
1645
1646 if (stat->sc == MBUF_SC_UNSPEC) {
1647 if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
1648 TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
1649 if (fq_if_is_grp_all_paced(grp)) {
1650 continue;
1651 }
1652 stat->packets += FQG_LEN(grp);
1653 stat->bytes += FQG_BYTES(grp);
1654 }
1655 } else {
1656 grp = fq_if_find_grp(fqs, stat->grp_idx);
1657 if (!fq_if_is_grp_all_paced(grp)) {
1658 stat->packets = FQG_LEN(grp);
1659 stat->bytes = FQG_BYTES(grp);
1660 }
1661 }
1662 } else {
1663 if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
1664 TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
1665 if (fq_if_is_grp_all_paced(grp)) {
1666 continue;
1667 }
1668 fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
1669 stat->packets += grp_sc_stat.packets;
1670 stat->bytes += grp_sc_stat.bytes;
1671 }
1672 } else {
1673 grp = fq_if_find_grp(fqs, stat->grp_idx);
1674 fq_if_grp_stat_sc(fqs, grp, stat, now);
1675 }
1676 }
1677 }
1678
1679 int
fq_if_request_classq(struct ifclassq * ifq,cqrq_t rq,void * arg)1680 fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg)
1681 {
1682 int err = 0;
1683 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1684
1685 IFCQ_LOCK_ASSERT_HELD(ifq);
1686
1687 /*
1688 * These are usually slow operations, convert the lock ahead of time
1689 */
1690 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1691 switch (rq) {
1692 case CLASSQRQ_PURGE:
1693 fq_if_purge(fqs);
1694 break;
1695 case CLASSQRQ_PURGE_SC:
1696 fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
1697 break;
1698 case CLASSQRQ_EVENT:
1699 fq_if_event(fqs, *(cqev_t *)arg);
1700 break;
1701 case CLASSQRQ_THROTTLE:
1702 fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
1703 break;
1704 case CLASSQRQ_STAT_SC:
1705 fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
1706 break;
1707 }
1708 return err;
1709 }
1710
1711 int
fq_if_setup_ifclassq(struct ifclassq * ifq,u_int32_t flags,classq_pkt_type_t ptype)1712 fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
1713 classq_pkt_type_t ptype)
1714 {
1715 fq_if_t *fqs = NULL;
1716 int err = 0;
1717
1718 IFCQ_LOCK_ASSERT_HELD(ifq);
1719 VERIFY(ifq->ifcq_disc == NULL);
1720 VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
1721
1722 fqs = fq_if_alloc(ifq, ptype);
1723 if (fqs == NULL) {
1724 return ENOMEM;
1725 }
1726 if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
1727 fqs->fqs_flags |= FQS_DRIVER_MANAGED;
1728 fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
1729 } else {
1730 fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
1731 }
1732
1733 err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
1734 if (err != 0) {
1735 os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
1736 "failed to attach fq_if: %d\n", __func__, err);
1737 fq_if_destroy(fqs);
1738 return err;
1739 }
1740
1741 /*
1742 * Always create one group. If qset 0 is added later,
1743 * this group will be updated.
1744 */
1745 err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
1746 if (err != 0) {
1747 os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
1748 "failed to create a fq group: %d\n", __func__, err);
1749 fq_if_destroy(fqs);
1750 }
1751
1752 return err;
1753 }
1754
1755 fq_t *
fq_if_hash_pkt(fq_if_t * fqs,fq_if_group_t * fq_grp,uint32_t flowid,mbuf_svc_class_t svc_class,uint64_t now,bool create,fq_tfc_type_t tfc_type)1756 fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, uint32_t flowid,
1757 mbuf_svc_class_t svc_class, uint64_t now, bool create,
1758 fq_tfc_type_t tfc_type)
1759 {
1760 fq_t *fq = NULL;
1761 flowq_list_t *fq_list;
1762 fq_if_classq_t *fq_cl;
1763 uint32_t fqs_hash_id;
1764 u_int8_t scidx;
1765
1766 scidx = fq_if_service_to_priority(fqs, svc_class);
1767
1768 fqs_hash_id = fq_if_flow_hash_id(flowid);
1769
1770 fq_list = &fqs->fqs_flows[fqs_hash_id];
1771
1772 LIST_FOREACH(fq, fq_list, fq_hashlink) {
1773 if (fq->fq_flowhash == flowid &&
1774 fq->fq_sc_index == scidx &&
1775 fq->fq_tfc_type == tfc_type &&
1776 fq->fq_group == fq_grp) {
1777 break;
1778 }
1779 }
1780 if (fq == NULL && create) {
1781 /* If the flow is not already on the list, allocate it */
1782 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1783 fq = fq_alloc(fqs->fqs_ptype);
1784 if (fq != NULL) {
1785 fq->fq_flowhash = flowid;
1786 fq->fq_sc_index = scidx;
1787 fq->fq_group = fq_grp;
1788 fq->fq_tfc_type = tfc_type;
1789 fq_cl = &FQ_CLASSQ(fq);
1790 fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
1791 fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
1792 fq->fq_next_tx_time = FQ_INVALID_TX_TS;
1793 LIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
1794 fq_cl->fcl_stat.fcl_flows_cnt++;
1795 }
1796 KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
1797 fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
1798 AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
1799 } else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
1800 fq_if_reuse_empty_flow(fqs, fq, now);
1801 }
1802
1803 /*
1804 * If getq time is not set because this is the first packet or after
1805 * idle time, set it now so that we can detect a stall.
1806 */
1807 if (fq != NULL && fq->fq_getqtime == 0) {
1808 fq->fq_getqtime = now;
1809 }
1810
1811 return fq;
1812 }
1813
1814 void
fq_if_destroy_flow(fq_if_t * fqs,fq_if_classq_t * fq_cl,fq_t * fq)1815 fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
1816 {
1817 ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
1818 LIST_REMOVE(fq, fq_hashlink);
1819 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
1820 if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
1821 fq_if_flow_feedback(fqs, fq, fq_cl);
1822 }
1823 KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
1824 fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
1825 AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
1826 fq_destroy(fq, fqs->fqs_ptype);
1827 }
1828
1829 inline boolean_t
fq_if_at_drop_limit(fq_if_t * fqs)1830 fq_if_at_drop_limit(fq_if_t *fqs)
1831 {
1832 return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
1833 TRUE : FALSE;
1834 }
1835
1836 inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t * fqs)1837 fq_if_almost_at_drop_limit(fq_if_t *fqs)
1838 {
1839 /*
1840 * Whether we are above 90% of the queue limit. This is used to tell if we
1841 * can stop flow controlling the largest flow.
1842 */
1843 return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
1844 }
1845
1846 static inline void
fq_if_reuse_empty_flow(fq_if_t * fqs,fq_t * fq,uint64_t now)1847 fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
1848 {
1849 ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
1850 TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
1851 STAILQ_NEXT(fq, fq_actlink) = NULL;
1852 fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
1853 fq->fq_empty_purge_time = 0;
1854 fq->fq_getqtime = 0;
1855 fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
1856 fqs->fqs_empty_list_cnt--;
1857 fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
1858 fq_cl->fcl_stat.fcl_flows_cnt++;
1859 }
1860
/*
 * Park a drained flow on the empty-flow list, from where it can either
 * be cheaply revived (fq_if_reuse_empty_flow) or purged once its
 * fq_empty_purge_time deadline passes.
 */
inline void
fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * NOTE(review): as written this asserts that some flag OUTSIDE the
	 * NEW/OLD/FLOWCTL set is present.  If the intent was "none of
	 * NEW/OLD/FLOWCTL is set", the expression should be
	 * (fq->fq_flags & (...)) == 0 — confirm against the callers.
	 */
	ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
	fq->fq_empty_purge_time = now + fq_empty_purge_delay;
	TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
	fq->fq_flags |= FQF_EMPTY_FLOW;
	FQ_CLEAR_OVERWHELMING(fq);
	fqs->fqs_empty_list_cnt++;
	/*
	 * fcl_flows_cnt is used in budget determination for the class.
	 * empty flow shouldn't contribute to the budget.
	 */
	fq_cl->fcl_stat.fcl_flows_cnt--;
}
1877
1878 static void
fq_if_purge_empty_flow(fq_if_t * fqs,fq_t * fq)1879 fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
1880 {
1881 fq_if_classq_t *fq_cl;
1882 fq_cl = &FQ_CLASSQ(fq);
1883
1884 ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
1885 TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
1886 fq->fq_flags &= ~FQF_EMPTY_FLOW;
1887 fqs->fqs_empty_list_cnt--;
1888 /* Remove from the hash list and free the flow queue */
1889 fq_if_destroy_flow(fqs, fq_cl, fq);
1890 }
1891
1892 static void
fq_if_purge_empty_flow_list(fq_if_t * fqs,uint64_t now,bool purge_all)1893 fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
1894 {
1895 fq_t *fq, *tmp;
1896 int i = 0;
1897
1898 if (fqs->fqs_empty_list_cnt == 0) {
1899 ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
1900 return;
1901 }
1902
1903 TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
1904 if (!purge_all && ((now < fq->fq_empty_purge_time) ||
1905 (i++ == FQ_EMPTY_PURGE_MAX))) {
1906 break;
1907 }
1908 fq_if_purge_empty_flow(fqs, fq);
1909 }
1910
1911 if (__improbable(purge_all)) {
1912 VERIFY(fqs->fqs_empty_list_cnt == 0);
1913 VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1914 }
1915 }
1916
1917 static void
fq_if_empty_old_flow(fq_if_t * fqs,fq_if_classq_t * fq_cl,fq_t * fq,uint64_t now)1918 fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
1919 uint64_t now)
1920 {
1921 /*
1922 * Remove the flow queue from the old flows list.
1923 */
1924 STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
1925 fq->fq_flags &= ~FQF_OLD_FLOW;
1926 fq_cl->fcl_stat.fcl_oldflows_cnt--;
1927 VERIFY(fq->fq_bytes == 0);
1928
1929 /* release any flow control */
1930 if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
1931 fq_if_flow_feedback(fqs, fq, fq_cl);
1932 }
1933
1934 /* move the flow queue to empty flows list */
1935 fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
1936 }
1937
1938 static void
fq_if_empty_new_flow(fq_t * fq,fq_if_classq_t * fq_cl)1939 fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
1940 {
1941 /* Move to the end of old queue list */
1942 STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
1943 flowq, fq_actlink);
1944 fq->fq_flags &= ~FQF_NEW_FLOW;
1945 fq_cl->fcl_stat.fcl_newflows_cnt--;
1946
1947 STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
1948 fq->fq_flags |= FQF_OLD_FLOW;
1949 fq_cl->fcl_stat.fcl_oldflows_cnt++;
1950 }
1951
/*
 * Drop one packet from the cached largest flow (fqs_large_flow) when
 * the scheduler is over its limits.  No-op when no large flow is
 * cached.  Updates drop stats and demotes the flow off its active list
 * if this drop empties it.
 */
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *__single pkt_flags;
	uint64_t *__single pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* queue can not be empty on the largest flow */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	_PKTSCHED_PKT_INIT(&pkt);
	/* Take the head packet off the flow without CoDel processing. */
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* Scrub scheduler-private metadata before handing the pkt off. */
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* If that was the last packet, retire the flow from its list. */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	/* With verbose droptap, report the drop; otherwise just free. */
	if (__improbable(droptap_verbose > 0)) {
		pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_DROP,
		    __func__, __LINE__, 0);
	} else {
		pktsched_free_pkt(&pkt);
	}
	fq_cl->fcl_stat.fcl_drop_overflow++;
}
2012
2013 inline void
fq_if_is_flow_heavy(fq_if_t * fqs,fq_t * fq)2014 fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
2015 {
2016 fq_t *prev_fq;
2017
2018 if (fqs->fqs_large_flow != NULL &&
2019 fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2020 fqs->fqs_large_flow = NULL;
2021 }
2022
2023 if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2024 return;
2025 }
2026
2027 prev_fq = fqs->fqs_large_flow;
2028 if (prev_fq == NULL) {
2029 if (!fq_empty(fq, fqs->fqs_ptype)) {
2030 fqs->fqs_large_flow = fq;
2031 }
2032 return;
2033 } else if (fq->fq_bytes > prev_fq->fq_bytes) {
2034 fqs->fqs_large_flow = fq;
2035 }
2036 }
2037
2038 boolean_t
fq_if_add_fcentry(fq_if_t * fqs,pktsched_pkt_t * pkt,uint8_t flowsrc,fq_t * fq,fq_if_classq_t * fq_cl)2039 fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
2040 fq_t *fq, fq_if_classq_t *fq_cl)
2041 {
2042 struct flowadv_fcentry *fce;
2043
2044 #if DEBUG || DEVELOPMENT
2045 if (__improbable(ifclassq_flow_control_adv == 0)) {
2046 os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
2047 return TRUE;
2048 }
2049 #endif /* DEBUG || DEVELOPMENT */
2050
2051 STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
2052 if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
2053 fce->fce_flowid == fq->fq_flowhash) {
2054 /* Already on flowcontrol list */
2055 return TRUE;
2056 }
2057 }
2058 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
2059 fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
2060 if (fce != NULL) {
2061 /* XXX Add number of bytes in the queue */
2062 STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
2063 fq_cl->fcl_stat.fcl_flow_control++;
2064 os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
2065 "flow: 0x%x, iface: %s, B:%u\n", __func__,
2066 fq_cl->fcl_stat.fcl_flow_control,
2067 fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
2068 if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes);
2069 KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
2070 fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
2071 fq->fq_bytes, fq->fq_min_qdelay);
2072 }
2073
2074 if (fce != NULL && fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
2075 kern_channel_flowadv_set(fce);
2076 }
2077
2078 return (fce != NULL) ? TRUE : FALSE;
2079 }
2080
/*
 * Unlink a flow-control entry from the scheduler's flow control list
 * and hand it to the flow advisory mechanism for delivery.
 */
static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	STAILQ_NEXT(fce, fce_link) = NULL;
	/* ownership of fce is transferred to the flow advisory table */
	flowadv_add_entry(fce);
}
2088
/*
 * Release flow control on a flow: look up its entry on the flow
 * control list, deliver a flow-control feedback advisory if one is
 * found, and clear the flow's FQF_FLOWCTL_ON flag unconditionally.
 */
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	/* Advisory delivery may block; convert to the exclusive lock first */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index,
		    fq->fq_bytes);
		/* fce is unlinked and handed off here; do not touch it after */
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	/* Cleared even if no entry was found on the list */
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}
2116
2117 boolean_t
fq_if_report_ce(fq_if_t * fqs,pktsched_pkt_t * pkt,uint32_t ce_cnt,uint32_t pkt_cnt)2118 fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt,
2119 uint32_t pkt_cnt)
2120 {
2121 struct flowadv_fcentry *fce;
2122
2123 #if DEBUG || DEVELOPMENT
2124 if (__improbable(ifclassq_flow_control_adv == 0)) {
2125 os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
2126 return TRUE;
2127 }
2128 #endif /* DEBUG || DEVELOPMENT */
2129
2130 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
2131 fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
2132 if (fce != NULL) {
2133 fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED;
2134 fce->fce_ce_cnt = ce_cnt;
2135 fce->fce_pkts_since_last_report = pkt_cnt;
2136
2137 flowadv_add_entry(fce);
2138 }
2139 return (fce != NULL) ? TRUE : FALSE;
2140 }
2141
2142
/*
 * Dequeue up to pktlimit packets / bytelimit bytes from one class
 * queue using DRR over the new-flows and old-flows lists (RFC 8290).
 * Packets are chained either directly onto top/bottom or staged into
 * per-flow dequeue lists (fq_dqlist) when the caller supplies one.
 * Flows whose pacing tx time has not arrived are skipped; when every
 * flow is paced, the class is marked FCL_PACED and the earliest next
 * tx time is reported via fcl_next_tx_time / *next_tx_time.
 * Returns dequeued counts through retpktcnt/retbytecnt.
 */
void
fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now, bool *fq_cl_paced,
    uint64_t *next_tx_time)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	bool all_paced = true;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;
	uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS;

	/* Pick the per-packet-type flow dequeue routine */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

	/* Pass 1: service new flows first, as RFC 8290 prescribes */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_NEW_FLOW);
		uint64_t fq_tx_time;
		/* Skip flows that are paced beyond 'now'; remember earliest */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		/* Stage into the per-flow dq list, or chain directly */
		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/*
		 * From RFC 8290:
		 * if that queue has a negative number of credits (i.e., it has already
		 * dequeued at least a quantum of bytes), it is given an additional
		 * quantum of credits, the queue is put onto _the end of_ the list of
		 * old queues, and the routine selects the next queue and starts again.
		 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}
		//TODO: add credit when it's now paced? so that the fq is treated the same as empty

		/* Flow may have become paced after the dequeue above */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (limit_reached) {
			goto done;
		}
	}

	/* Pass 2: service old flows with the remaining budget */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;
		uint64_t fq_tx_time;

		/* Skip flows that are paced beyond 'now'; remember earliest */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			/* deferred teardown when staging via dq list */
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/* Flow may have become paced after the dequeue above */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	/* Every runnable flow was paced: record when this class can tx next */
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	/* Re-append flows that exhausted their deficit to the old list */
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}
2327
/*
 * Tear down the FQ-CoDel scheduler attached to an interface class
 * queue: destroy the scheduler state, clear the discipline pointer,
 * then detach the class queue from the interface.
 */
void
fq_if_teardown_ifclassq(struct ifclassq *ifq)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL);
	/* destroy before clearing so no window with a dangling disc */
	fq_if_destroy_fqs(fqs);
	ifq->ifcq_disc = NULL;
	ifclassq_detach(ifq);
}
2339
2340 static void
fq_export_flowstats(fq_if_t * fqs,fq_t * fq,struct fq_codel_flowstats * flowstat)2341 fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
2342 struct fq_codel_flowstats *flowstat)
2343 {
2344 bzero(flowstat, sizeof(*flowstat));
2345 flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
2346 flowstat->fqst_bytes = fq->fq_bytes;
2347 flowstat->fqst_flowhash = fq->fq_flowhash;
2348 if (fq->fq_flags & FQF_NEW_FLOW) {
2349 flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
2350 }
2351 if (fq->fq_flags & FQF_OLD_FLOW) {
2352 flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
2353 }
2354 if (fq->fq_flags & FQF_DELAY_HIGH) {
2355 flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
2356 }
2357 if (fq->fq_flags & FQF_FLOWCTL_ON) {
2358 flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
2359 }
2360 if (fqs->fqs_large_flow == fq) {
2361 flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
2362 }
2363 }
2364
2365 int
fq_if_getqstats_ifclassq(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)2366 fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
2367 struct if_ifclassq_stats *ifqs)
2368 {
2369 struct fq_codel_classstats *fcls;
2370 fq_if_classq_t *fq_cl;
2371 fq_if_t *fqs;
2372 fq_t *fq = NULL;
2373 fq_if_group_t *grp;
2374 u_int32_t i, flowstat_cnt;
2375
2376 if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
2377 return EINVAL;
2378 }
2379
2380 fqs = (fq_if_t *)ifq->ifcq_disc;
2381 if (fqs->fqs_classq_groups[gid] == NULL) {
2382 return ENXIO;
2383 }
2384
2385 fcls = &ifqs->ifqs_fq_codel_stats;
2386
2387 fq_cl = &FQS_CLASSQ(fqs, gid, qid);
2388 grp = fq_if_find_grp(fqs, gid);
2389
2390 fcls->fcls_pri = fq_cl->fcl_pri;
2391 fcls->fcls_service_class = fq_cl->fcl_service_class;
2392 fcls->fcls_quantum = fq_cl->fcl_quantum;
2393 fcls->fcls_drr_max = fq_cl->fcl_drr_max;
2394 fcls->fcls_budget = fq_cl->fcl_budget;
2395 fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
2396 fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
2397 fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
2398 fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
2399 fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
2400 fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
2401 fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
2402 fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
2403 fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
2404 fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
2405 fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
2406 fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
2407 fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
2408 fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2409 fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2410 fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
2411 fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
2412 fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
2413 fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
2414 fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
2415 fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
2416 fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
2417 fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
2418 fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
2419 fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
2420 fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
2421 fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
2422 fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
2423 fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
2424 fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
2425 fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
2426 fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
2427 fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
2428 fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
2429 fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;
2430
2431 /* Gather per flow stats */
2432 flowstat_cnt = min((fcls->fcls_newflows_cnt +
2433 fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
2434 i = 0;
2435 STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
2436 if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
2437 break;
2438 }
2439
2440 /* leave space for a few old flows */
2441 if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
2442 i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
2443 break;
2444 }
2445 fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2446 i++;
2447 }
2448 STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
2449 if (i >= flowstat_cnt) {
2450 break;
2451 }
2452 fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2453 i++;
2454 }
2455 VERIFY(i <= flowstat_cnt);
2456 fcls->fcls_flowstats_cnt = i;
2457 return 0;
2458 }
2459
/*
 * Create (or, for group 0, update) a FQ-CoDel class queue group at
 * grp_idx.  Initializes the group's class queues with quanta derived
 * from the interface, then applies the group mode (combined vs
 * separated) and the target-qdelay / update-interval parameters.
 *
 * Returns 0 on success, EINVAL if the (non-zero) group already
 * exists, ENOMEM on allocation failure.
 */
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
#define _FQ_CLASSQ_INIT(_grp, _s, _q) \
	fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX, \
	FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s), \
	MBUF_SC_ ## _s );

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	/* Group 0 may be re-configured in place if it already exists */
	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	/*
	 * NOTE(review): with Z_WAITOK the allocation blocks rather than
	 * failing, so this NULL check looks defensive — confirm.
	 */
	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		/* driver-managed interfaces use only the four WMM classes */
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		_CASSERT(SCIDX_SIG == SCIDX_VI);
		_CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	/* Default group is scheduled combined with other combined groups */
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	/* Compute classic and L4S qdelay targets / update intervals */
	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}
2540
2541 fq_if_group_t *
fq_if_find_grp(fq_if_t * fqs,uint8_t grp_idx)2542 fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
2543 {
2544 fq_if_group_t *grp;
2545
2546 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2547 VERIFY(grp_idx < FQ_IF_MAX_GROUPS);
2548
2549 grp = fqs->fqs_classq_groups[grp_idx];
2550 VERIFY(grp != NULL);
2551
2552 return grp;
2553 }
2554
2555 static void
fq_if_purge_grp(fq_if_t * fqs,fq_if_group_t * grp)2556 fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
2557 {
2558 for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
2559 fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
2560 }
2561
2562 bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
2563 grp->fqg_len = 0;
2564 grp->fqg_bytes = 0;
2565 fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
2566 }
2567
2568 void
fq_if_destroy_grps(fq_if_t * fqs)2569 fq_if_destroy_grps(fq_if_t *fqs)
2570 {
2571 fq_if_group_t *__single grp;
2572
2573 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2574
2575 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
2576 if (fqs->fqs_classq_groups[grp_idx] == NULL) {
2577 continue;
2578 }
2579
2580 grp = fq_if_find_grp(fqs, grp_idx);
2581 fq_if_purge_grp(fqs, grp);
2582 zfree(fq_if_grp_zone, grp);
2583 fqs->fqs_classq_groups[grp_idx] = NULL;
2584 }
2585 }
2586
/*
 * Return TRUE when the group at grp_idx is currently scheduled in
 * combined mode (its bit is set in the combined-group bitmap).
 */
static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}
2592
2593 void
fq_if_set_grp_combined(struct ifclassq * ifcq,uint8_t grp_idx)2594 fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
2595 {
2596 fq_if_t *fqs;
2597 fq_if_group_t *grp;
2598
2599 IFCQ_LOCK_ASSERT_HELD(ifcq);
2600
2601 fqs = (fq_if_t *)ifcq->ifcq_disc;
2602 grp = fq_if_find_grp(fqs, grp_idx);
2603
2604 if (fq_if_is_grp_combined(fqs, grp_idx)) {
2605 return;
2606 }
2607
2608 /*
2609 * We keep the current fq_deficit and fcl_budget when combining a group.
2610 * That might disrupt the AQM but only for a moment.
2611 */
2612 pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
2613 TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2614 }
2615
2616 void
fq_if_set_grp_separated(struct ifclassq * ifcq,uint8_t grp_idx)2617 fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
2618 {
2619 fq_if_t *fqs;
2620 fq_if_group_t *grp;
2621
2622 IFCQ_LOCK_ASSERT_HELD(ifcq);
2623
2624 fqs = (fq_if_t *)ifcq->ifcq_disc;
2625 grp = fq_if_find_grp(fqs, grp_idx);
2626
2627 if (!fq_if_is_grp_combined(fqs, grp_idx)) {
2628 return;
2629 }
2630
2631 pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
2632 TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2633 }
2634