1 /*
2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <kern/zalloc.h>
32 #include <net/ethernet.h>
33 #include <net/if_var.h>
34 #include <net/if.h>
35 #include <net/droptap.h>
36 #include <net/classq/classq.h>
37 #include <net/classq/classq_fq_codel.h>
38 #include <net/pktsched/pktsched_ops.h>
39 #include <net/pktsched/pktsched_fq_codel.h>
40 #include <os/log.h>
41 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
42 #include <mach/thread_act.h>
43 #include <kern/thread.h>
44 #include <kern/sched_prim.h>
45
46 #include <skywalk/core/skywalk_var.h>
47
48 #define FQ_CODEL_DEFAULT_QUANTUM 1500
49
50 #define FQ_CODEL_QUANTUM_BK_SYS(_q) (_q)
51 #define FQ_CODEL_QUANTUM_BK(_q) (_q)
52 #define FQ_CODEL_QUANTUM_BE(_q) (_q)
53 #define FQ_CODEL_QUANTUM_RD(_q) (_q)
54 #define FQ_CODEL_QUANTUM_OAM(_q) (_q)
55 #define FQ_CODEL_QUANTUM_AV(_q) (_q * 2)
56 #define FQ_CODEL_QUANTUM_RV(_q) (_q * 2)
57 #define FQ_CODEL_QUANTUM_VI(_q) (_q * 2)
58 #define FQ_CODEL_QUANTUM_VO(_q) ((_q * 2) / 5)
59 #define FQ_CODEL_QUANTUM_CTL(_q) ((_q * 2) / 5)
60
61 #define IFQ_DEF_C_TARGET_DELAY (10ULL * 1000 * 1000) /* 10 ms */
62 #define IFQ_DEF_C_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */
63 #define IFQ_DEF_L4S_TARGET_DELAY (2ULL * 1000 * 1000) /* 2 ms */
64 #define IFQ_DEF_L4S_WIRELESS_TARGET_DELAY (15ULL * 1000 * 1000) /* 15 ms */
65 #define IFQ_DEF_L4S_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */
66 #define IFQ_LL_C_TARGET_DELAY (10ULL * 1000 * 1000) /* 10 ms */
67 #define IFQ_LL_C_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */
68 #define IFQ_LL_L4S_TARGET_DELAY (2ULL * 1000 * 1000) /* 2 ms */
69 #define IFQ_LL_L4S_WIRELESS_TARGET_DELAY (15ULL * 1000 * 1000) /* 15 ms */
70 #define IFQ_LL_L4S_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */
71
72 static uint64_t fq_if_def_c_target_qdelay = 0;
73 static uint64_t fq_if_def_c_update_interval = 0;
74 static uint64_t fq_if_def_l4s_target_qdelay = 0;
75 static uint64_t fq_if_def_l4s_update_interval = 0;
76 static uint64_t fq_if_ll_c_target_qdelay = 0;
77 static uint64_t fq_if_ll_c_update_interval = 0;
78 static uint64_t fq_if_ll_l4s_target_qdelay = 0;
79 static uint64_t fq_if_ll_l4s_update_interval = 0;
80
81 uint32_t fq_codel_quantum = 0;
82
83 static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
84 static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);
85
86 SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
87 0, "FQ-CODEL parameters");
88
89 static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
90 #if (DEVELOPMENT || DEBUG)
91 SYSCTL_EXTENSIBLE_NODE(_net_classq_fq_codel, OID_AUTO, params,
92 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "classq fq codel parameters");
93
94 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
95 CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
96
97 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED,
98 &fq_if_def_c_target_qdelay, "classic target queue delay in nanoseconds");
99
100 SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, update_interval,
101 CTLFLAG_RW | CTLFLAG_LOCKED, &fq_if_def_c_update_interval,
102 "classic update interval in nanoseconds");
#endif /* DEVELOPMENT || DEBUG */
104
105 unsigned int fq_codel_enable_pacing = 1;
106 SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED,
107 &fq_codel_enable_pacing, 0, "Enable pacing");
108
109 uint32_t fq_codel_enable_l4s = 1;
110 SYSCTL_UINT(_net_classq_fq_codel, OID_AUTO, enable_l4s,
111 CTLFLAG_RW | CTLFLAG_LOCKED, &fq_codel_enable_l4s, 0,
112 "enable/disable L4S");
113
114 uint32_t fq_codel_enable_ecn = 0;
115 SYSCTL_UINT(_net_classq_fq_codel, OID_AUTO, enable_ecn,
116 CTLFLAG_RW | CTLFLAG_LOCKED, &fq_codel_enable_ecn, 0,
117 "enable/disable ECN for classic traffic");
118
119 typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;
120
121 static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
122 static void fq_if_destroy(fq_if_t *fqs);
123 static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
124 uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
125 static void fq_if_dequeue_class(fq_if_t *, fq_if_classq_t *, uint32_t,
126 int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
127 uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*);
128 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
129 static void fq_if_purge(fq_if_t *);
130 static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
131 static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
132 uint64_t);
133 static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
134 static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
135 fq_t *fq, uint64_t now);
136 static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
137 static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
138 bool purge_all);
139 static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
140 static int fq_if_dequeue_sc_separate(struct ifclassq *ifq,
141 mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
142 classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
143 u_int32_t *retbytecnt, uint8_t grp_idx);
144 static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
145 cqrq_stat_sc_t *stat, uint64_t now);
146 static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
147 static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
148 static void fq_if_destroy_grps(fq_if_t *fqs);
149 static void fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx);
150 static void fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx);
151 static void fq_if_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay,
152 uint32_t flags);
153 static void fq_if_calc_update_interval(uint64_t *update_interval, uint32_t flags);
154
/*
 * Default per-class DRR (deficit round robin) max values, indexed by
 * class priority; higher-priority classes get larger values.  May be
 * overridden at boot via the "ifcq_drr_max" boot-arg (see
 * pktsched_fq_init).
 */
uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
	[FQ_IF_CTL_INDEX] = 8,
	[FQ_IF_VO_INDEX] = 8,
	[FQ_IF_VI_INDEX] = 6,
	[FQ_IF_RV_INDEX] = 6,
	[FQ_IF_AV_INDEX] = 6,
	[FQ_IF_OAM_INDEX] = 4,
	[FQ_IF_RD_INDEX] = 4,
	[FQ_IF_BE_INDEX] = 4,
	[FQ_IF_BK_INDEX] = 2,
	[FQ_IF_BK_SYS_INDEX] = 2,
};

/* Accessor keyed by service-class suffix, e.g. FQ_CODEL_DRR_MAX(VO). */
#define FQ_CODEL_DRR_MAX(_s) fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]
169
170 static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
171 fq_if_state state);
172 static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
173 fq_if_state dst_state, fq_if_state src_state);
174 static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
175 fq_if_state state);
176 static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
177 fq_if_state state, fq_if_group_t **selected_grp);
178 static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
179 fq_if_state dst_state, fq_if_state src_state);
180
181 static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
182 fq_if_state state);
183 static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
184 fq_if_state dst_state, fq_if_state src_state);
185 static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
186 fq_if_state state);
187 static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
188 fq_if_state state, fq_if_group_t **selected_grp);
189 static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
190 fq_if_state dst_state, fq_if_state src_state);
191
192 void fq_if_teardown(struct ifclassq *ifq);
193 int fq_if_request(struct ifclassq *ifq, cqrq_t rq, void *arg);
194 int fq_if_getqstats(struct ifclassq *ifq, uint8_t gid,
195 u_int32_t qid, struct if_ifclassq_stats *ifqs);
196 int fq_if_enqueue(struct ifclassq *ifq, classq_pkt_t *h,
197 classq_pkt_t *t, uint32_t cnt, uint32_t bytes, boolean_t *pdrop);
198 int fq_if_dequeue(struct ifclassq *ifq, u_int32_t maxpktcnt,
199 u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet,
200 u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx);
201 int fq_if_dequeue_sc(struct ifclassq *ifq,
202 mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
203 classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
204 u_int32_t *retbytecnt, uint8_t grp_idx);
205 int fq_if_setup_legacy(struct ifclassq *ifq, u_int32_t flags,
206 classq_pkt_type_t ptype);
207 boolean_t fq_if_allow_dequeue(struct ifclassq *ifq);
208
209 int fq_if_setup_new(struct ifclassq *ifq, u_int32_t flags,
210 classq_pkt_type_t ptype);
211
/*
 * Bitmap operations used when dequeueing without per-service-class
 * separation (regular, priority-indexed group bitmaps).
 */
bitmap_ops_t fq_if_grps_bitmap_ops =
{
	.ffs = fq_if_grps_bitmap_ffs,
	.zeros = fq_if_grps_bitmap_zeros,
	.cpy = fq_if_grps_bitmap_cpy,
	.clr = fq_if_grps_bitmap_clr,
	.move = fq_if_grps_bitmap_move,
};

/*
 * Bitmap operations used for per-service-class (driver-managed style)
 * dequeue; same interface as above, sc-specific implementations.
 */
bitmap_ops_t fq_if_grps_sc_bitmap_ops =
{
	.ffs = fq_if_grps_sc_bitmap_ffs,
	.zeros = fq_if_grps_sc_bitmap_zeros,
	.cpy = fq_if_grps_sc_bitmap_cpy,
	.clr = fq_if_grps_sc_bitmap_clr,
	.move = fq_if_grps_sc_bitmap_move,
};
229
230 static uint32_t fq_if_hash_table_size;
231
/* Scheduler ops vector for the legacy FQ-CoDel discipline. */
struct pktsched_ops fq_codel_classq_ops = {
	.ps_id = PKTSCHEDT_FQ_CODEL,
	.ps_setup = fq_if_setup_legacy,
	.ps_teardown = fq_if_teardown,
	.ps_enq = fq_if_enqueue,
	.ps_deq = fq_if_dequeue,
	.ps_deq_sc = fq_if_dequeue_sc,
	.ps_req = fq_if_request,
	.ps_stats = fq_if_getqstats,
	.ps_allow_dequeue = fq_if_allow_dequeue,
};

/*
 * Scheduler ops vector for the new FQ-CoDel discipline; identical to the
 * legacy one except for the scheduler id and the setup entry point.
 */
struct pktsched_ops new_fq_codel_classq_ops = {
	.ps_id = PKTSCHEDT_FQ_CODEL_NEW,
	.ps_setup = fq_if_setup_new,
	.ps_teardown = fq_if_teardown,
	.ps_enq = fq_if_enqueue,
	.ps_deq = fq_if_dequeue,
	.ps_deq_sc = fq_if_dequeue_sc,
	.ps_req = fq_if_request,
	.ps_stats = fq_if_getqstats,
	.ps_allow_dequeue = fq_if_allow_dequeue,
};
255
256 void
pktsched_fq_init(void)257 pktsched_fq_init(void)
258 {
259 pktsched_ops_register(&fq_codel_classq_ops);
260 pktsched_ops_register(&new_fq_codel_classq_ops);
261
262 if (serverperfmode) {
263 fq_if_hash_table_size = (1 << 16);
264 } else {
265 fq_if_hash_table_size = (1 << 8);
266 }
267
268 // format looks like ifcq_drr_max=8,8,6
269 char buf[(FQ_IF_MAX_CLASSES) * 3];
270 size_t i, len, pri_index = 0;
271 uint32_t drr = 0;
272 if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
273 return;
274 }
275
276 len = strbuflen(buf, sizeof(buf));
277 for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
278 if (buf[i] != ',' && buf[i] != '\0') {
279 VERIFY(buf[i] >= '0' && buf[i] <= '9');
280 drr = drr * 10 + buf[i] - '0';
281 continue;
282 }
283 fq_codel_drr_max_values[pri_index] = drr;
284 pri_index += 1;
285 drr = 0;
286 }
287
288 #if DEBUG || DEVELOPMENT
289 PE_parse_boot_argn("fq_codel_quantum", &fq_codel_quantum,
290 sizeof(fq_codel_quantum));
291 PE_parse_boot_argn("fq_if_def_c_target_qdelay", &fq_if_def_c_target_qdelay,
292 sizeof(fq_if_def_c_target_qdelay));
293 PE_parse_boot_argn("fq_if_def_c_update_interval",
294 &fq_if_def_c_update_interval, sizeof(fq_if_def_c_update_interval));
295 PE_parse_boot_argn("fq_if_def_l4s_target_qdelay", &fq_if_def_l4s_target_qdelay,
296 sizeof(fq_if_def_l4s_target_qdelay));
297 PE_parse_boot_argn("fq_if_def_l4s_update_interval",
298 &fq_if_def_l4s_update_interval, sizeof(fq_if_def_l4s_update_interval));
299 PE_parse_boot_argn("fq_if_ll_c_target_qdelay", &fq_if_ll_c_target_qdelay,
300 sizeof(fq_if_ll_c_target_qdelay));
301 PE_parse_boot_argn("fq_if_ll_c_update_interval",
302 &fq_if_ll_c_update_interval, sizeof(fq_if_ll_c_update_interval));
303 PE_parse_boot_argn("fq_if_ll_l4s_target_qdelay", &fq_if_ll_l4s_target_qdelay,
304 sizeof(fq_if_ll_l4s_target_qdelay));
305 PE_parse_boot_argn("fq_if_ll_l4s_update_interval",
306 &fq_if_ll_l4s_update_interval, sizeof(fq_if_ll_l4s_update_interval));
307 #endif /* DEBUG || DEVELOPMENT */
308
309 PE_parse_boot_argn("fq_codel_enable_pacing", &fq_codel_enable_pacing,
310 sizeof(fq_codel_enable_pacing));
311
312 fq_codel_init();
313 }
314
315 static uint32_t
fq_if_flow_hash_id(uint32_t flowid)316 fq_if_flow_hash_id(uint32_t flowid)
317 {
318 return flowid & (fq_if_hash_table_size - 1);
319 }
320
321 #define FQ_IF_CLASSQ_IDLE(_fcl_) \
322 (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
323 STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))
324
325 typedef void (* fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
326 typedef boolean_t (* fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
327 int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
328 uint32_t *, boolean_t *, uint64_t);
329
330 static void
fq_if_append_mbuf(classq_pkt_t * pkt,classq_pkt_t * next_pkt)331 fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
332 {
333 pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
334 }
335
336 static inline uint64_t
fq_codel_get_time(void)337 fq_codel_get_time(void)
338 {
339 struct timespec ts;
340 uint64_t now;
341
342 nanouptime(&ts);
343 now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
344 return now;
345 }
346
347 #if SKYWALK
/* Chain next_pkt's kernel packet onto pkt's via pkt_nextpkt (SKYWALK path). */
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
	pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
353 #endif /* SKYWALK */
354
355 #if SKYWALK
/*
 * Dequeue kernel packets (SKYWALK path) from one flow queue until the
 * flow's DRR deficit is exhausted, the caller's packet/byte limits are
 * hit, the queue empties, or the pacer (fq_tx_time_ready) says the next
 * packet is not yet eligible.
 *
 * Dequeued packets are appended to the caller's head/tail chain and
 * pkt_cnt/byte_cnt are incremented accordingly.  *qempty reports whether
 * the flow queue is empty on return.  Returns TRUE iff a caller limit
 * was reached.
 */
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	uint32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	/*
	 * Assert to make sure pflags is part of PKT_F_COMMON_MASK;
	 * all common flags need to be declared in that mask.
	 */
	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fqs->fqs_dequeue(fqs, fq, &pkt, now);
		/* Nothing was dequeued this round; re-check loop conditions */
		if (pkt.pktsched_pcnt == 0) {
			continue;
		}

		ASSERT(pkt.pktsched_ptype == QP_PACKET);

		/* Charge the packet length against the flow's DRR deficit */
		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;
		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			/* First dequeue from a fresh flow: mark the packet */
			pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* Splice the packet onto the caller's head/tail chain */
		if (head->cp_kpkt == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_kpkt != NULL);
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_kpkt->pkt_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
	return limit_reached;
}
417 #endif /* SKYWALK */
418
/*
 * Mbuf counterpart of fq_getq_flow_kpkt(): dequeue mbufs from one flow
 * queue until the DRR deficit is exhausted, the caller's packet/byte
 * limits are hit, the queue empties, or the pacer says the next packet
 * is not yet eligible.  Dequeued packets are appended to head/tail and
 * pkt_cnt/byte_cnt updated.  *qempty reports whether the flow queue is
 * empty on return.  Returns TRUE iff a caller limit was reached.
 */
static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
	u_int32_t plen;
	pktsched_pkt_t pkt;
	boolean_t limit_reached = FALSE;
	struct ifclassq *ifq = fqs->fqs_ifq;
	struct ifnet *ifp = ifq->ifcq_ifp;

	while (fq->fq_deficit > 0 && limit_reached == FALSE &&
	    !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
		_PKTSCHED_PKT_INIT(&pkt);
		fqs->fqs_dequeue(fqs, fq, &pkt, now);
		/* Nothing was dequeued this round; re-check loop conditions */
		if (pkt.pktsched_pcnt == 0) {
			continue;
		}

		ASSERT(pkt.pktsched_ptype == QP_MBUF);

		/* Charge the packet length against the flow's DRR deficit */
		plen = pktsched_get_pkt_len(&pkt);
		fq->fq_deficit -= plen;

		if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
			/* First dequeue from a fresh flow: mark the packet */
			pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW;
			fq->fq_flags &= ~FQF_FRESH_FLOW;
		}

		/* Splice the packet onto the caller's head/tail chain */
		if (head->cp_mbuf == NULL) {
			*head = pkt.pktsched_pkt;
		} else {
			ASSERT(tail->cp_mbuf != NULL);
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
		}
		*tail = pkt.pktsched_pkt;
		tail->cp_mbuf->m_nextpkt = NULL;
		fq_cl->fcl_stat.fcl_dequeue++;
		fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
		*pkt_cnt += 1;
		*byte_cnt += plen;

		ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

		/* Check if the limit is reached */
		if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
			limit_reached = TRUE;
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, fq->fq_min_qdelay);

	*qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
	return limit_reached;
}
477
478 static void
fq_if_pacemaker_tcall(thread_call_param_t arg0,thread_call_param_t arg1)479 fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
480 {
481 #pragma unused(arg1)
482 struct ifnet* ifp = (struct ifnet*)arg0;
483 ASSERT(ifp != NULL);
484
485 ifnet_start_ignore_delay(ifp);
486 }
487
488 static void
fq_if_calc_target_qdelay(struct ifnet * ifp,uint64_t * if_target_qdelay,uint32_t flags)489 fq_if_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay,
490 uint32_t flags)
491 {
492 uint64_t qdelay = 0, qdelay_configed = 0, qdely_default = 0;
493 if (flags == IF_CLASSQ_DEF) {
494 qdelay = IFCQ_TARGET_QDELAY(ifp->if_snd);
495 }
496
497 switch (flags) {
498 case IF_CLASSQ_DEF:
499 qdelay_configed = fq_if_def_c_target_qdelay;
500 qdely_default = IFQ_DEF_C_TARGET_DELAY;
501 break;
502 case IF_CLASSQ_L4S:
503 qdelay_configed = fq_if_def_l4s_target_qdelay;
504 if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI ||
505 ifp->if_family == IFNET_FAMILY_CELLULAR) {
506 qdely_default = IFQ_DEF_L4S_WIRELESS_TARGET_DELAY;
507 } else {
508 qdely_default = IFQ_DEF_L4S_TARGET_DELAY;
509 }
510 break;
511 case IF_CLASSQ_LOW_LATENCY:
512 qdelay_configed = fq_if_ll_c_target_qdelay;
513 qdely_default = IFQ_LL_C_TARGET_DELAY;
514 break;
515 case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S):
516 qdelay_configed = fq_if_ll_l4s_target_qdelay;
517 if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI ||
518 ifp->if_family == IFNET_FAMILY_CELLULAR) {
519 qdely_default = IFQ_LL_L4S_WIRELESS_TARGET_DELAY;
520 } else {
521 qdely_default = IFQ_LL_L4S_TARGET_DELAY;
522 }
523 break;
524 default:
525 VERIFY(0);
526 /* NOTREACHED */
527 __builtin_unreachable();
528 }
529
530 if (qdelay_configed != 0) {
531 qdelay = qdelay_configed;
532 }
533
534 /*
535 * If we do not know the effective bandwidth, use the default
536 * target queue delay.
537 */
538 if (qdelay == 0) {
539 qdelay = qdely_default;
540 }
541
542 /*
543 * If a delay has been added to ifnet start callback for
544 * coalescing, we have to add that to the pre-set target delay
545 * because the packets can be in the queue longer.
546 */
547 if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
548 ifp->if_start_delay_timeout > 0) {
549 qdelay += ifp->if_start_delay_timeout;
550 }
551
552 *(if_target_qdelay) = qdelay;
553 }
554
555 static void
fq_if_calc_update_interval(uint64_t * update_interval,uint32_t flags)556 fq_if_calc_update_interval(uint64_t *update_interval, uint32_t flags)
557 {
558 uint64_t interval = 0, interval_configed = 0, interval_default = 0;
559
560 switch (flags) {
561 case IF_CLASSQ_DEF:
562 interval_configed = fq_if_def_c_update_interval;
563 interval_default = IFQ_DEF_C_UPDATE_INTERVAL;
564 break;
565 case IF_CLASSQ_L4S:
566 interval_configed = fq_if_def_l4s_update_interval;
567 interval_default = IFQ_DEF_L4S_UPDATE_INTERVAL;
568 break;
569 case IF_CLASSQ_LOW_LATENCY:
570 interval_configed = fq_if_ll_c_update_interval;
571 interval_default = IFQ_LL_C_UPDATE_INTERVAL;
572 break;
573 case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S):
574 interval_configed = fq_if_ll_l4s_update_interval;
575 interval_default = IFQ_LL_L4S_UPDATE_INTERVAL;
576 break;
577 default:
578 VERIFY(0);
579 /* NOTREACHED */
580 __builtin_unreachable();
581 }
582
583 /* If the system level override is set, use it */
584 if (interval_configed != 0) {
585 interval = interval_configed;
586 }
587
588 /* Otherwise use the default value */
589 if (interval == 0) {
590 interval = interval_default;
591 }
592
593 *update_interval = interval;
594 }
595
/*
 * Allocate and initialize FQ-CoDel scheduler state for an interface
 * classq.  Returns NULL if the flow hash table cannot be allocated.
 */
fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
	flowq_list_t *fqs_flows;
	fq_if_t *fqs;

	ASSERT(ifq->ifcq_ifp != NULL);
	fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
	if (fqs == NULL) {
		return NULL;
	}
	/* Flow hash table; size was fixed at boot in pktsched_fq_init() */
	fqs_flows = kalloc_type(flowq_list_t, fq_if_hash_table_size, Z_WAITOK | Z_ZERO);
	if (fqs_flows == NULL) {
		zfree(fq_if_zone, fqs);
		return NULL;
	}
	fqs->fqs_flows = fqs_flows;
	fqs->fqs_flows_count = fq_if_hash_table_size;
	fqs->fqs_ifq = ifq;
	fqs->fqs_ptype = ptype;

	/* Configure packet drop limit across all queues */
	fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
	STAILQ_INIT(&fqs->fqs_fclist);
	TAILQ_INIT(&fqs->fqs_empty_list);
	TAILQ_INIT(&fqs->fqs_combined_grp_list);
	/*
	 * One-shot thread call used as a pacemaker to restart the
	 * interface start thread (see fq_if_pacemaker_tcall).
	 */
	fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall,
	    (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	ASSERT(fqs->fqs_pacemaker_tcall != NULL);

	return fqs;
}
629
/*
 * Tear down scheduler state created by fq_if_alloc(): cancel and free
 * the pacemaker thread call, purge all queued packets and groups, then
 * release the flow table and the scheduler instance.  Caller must hold
 * the IFCQ lock (converted here since teardown may block) and must NOT
 * hold the interface start lock (thread_call_cancel_wait could deadlock
 * against the pacemaker otherwise).
 */
void
fq_if_destroy(fq_if_t *fqs)
{
	struct ifnet *ifp = fqs->fqs_ifq->ifcq_ifp;
	thread_call_t __single tcall = fqs->fqs_pacemaker_tcall;

	VERIFY(ifp != NULL);
	ASSERT(tcall != NULL);
	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED);
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

	/*
	 * Since we are holding the IFCQ lock here, another thread cannot enter AQM
	 * and schedule a pacemaker call. So we do not need a sleep wait loop here
	 * cancel wait and free should succeed in one call.
	 */
	thread_call_cancel_wait(tcall);
	ASSERT(thread_call_free(tcall));

	fq_if_purge(fqs);
	fq_if_destroy_grps(fqs);

	fqs->fqs_ifq = NULL;

#if (DEBUG || DEVELOPMENT)
	struct skoid *fqs_skoid = (struct skoid *)&fqs->fqs_oid;
	skoid_destroy(fqs_skoid);
#endif /* (DEBUG || DEVELOPMENT) */

	kfree_type_counted_by(flowq_list_t, fqs->fqs_flows_count, fqs->fqs_flows);
	zfree(fq_if_zone, fqs);
}
663
/*
 * Map a packet service class to a scheduler priority index.  A
 * driver-managed scheduler (FQS_DRIVER_MANAGED) collapses the classes
 * into four buckets (BK/BE/VI/VO); otherwise every service class gets
 * its own index.  Unknown classes fall back to best effort.
 */
static inline uint8_t
fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
{
	uint8_t pri;

	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		/* 4-bucket mapping for driver-managed schedulers */
		switch (svc) {
		case MBUF_SC_BK_SYS:
		case MBUF_SC_BK:
			pri = FQ_IF_BK_INDEX;
			break;
		case MBUF_SC_BE:
		case MBUF_SC_RD:
		case MBUF_SC_OAM:
			pri = FQ_IF_BE_INDEX;
			break;
		case MBUF_SC_AV:
		case MBUF_SC_RV:
		case MBUF_SC_VI:
		case MBUF_SC_SIG:
			pri = FQ_IF_VI_INDEX;
			break;
		case MBUF_SC_VO:
		case MBUF_SC_CTL:
			pri = FQ_IF_VO_INDEX;
			break;
		default:
			pri = FQ_IF_BE_INDEX; /* Use best effort by default */
			break;
		}
		return pri;
	}

	/* scheduler is not managed by the driver */
	switch (svc) {
	case MBUF_SC_BK_SYS:
		pri = FQ_IF_BK_SYS_INDEX;
		break;
	case MBUF_SC_BK:
		pri = FQ_IF_BK_INDEX;
		break;
	case MBUF_SC_BE:
		pri = FQ_IF_BE_INDEX;
		break;
	case MBUF_SC_RD:
		pri = FQ_IF_RD_INDEX;
		break;
	case MBUF_SC_OAM:
		pri = FQ_IF_OAM_INDEX;
		break;
	case MBUF_SC_AV:
		pri = FQ_IF_AV_INDEX;
		break;
	case MBUF_SC_RV:
		pri = FQ_IF_RV_INDEX;
		break;
	case MBUF_SC_VI:
		pri = FQ_IF_VI_INDEX;
		break;
	case MBUF_SC_SIG:
		pri = FQ_IF_SIG_INDEX;
		break;
	case MBUF_SC_VO:
		pri = FQ_IF_VO_INDEX;
		break;
	case MBUF_SC_CTL:
		pri = FQ_IF_CTL_INDEX;
		break;
	default:
		pri = FQ_IF_BE_INDEX; /* Use best effort by default */
		break;
	}
	return pri;
}
738
739 void
fq_if_classq_init(fq_if_group_t * fqg,uint32_t pri,uint32_t quantum,uint32_t drr_max,uint32_t svc_class)740 fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
741 uint32_t drr_max, uint32_t svc_class)
742 {
743 fq_if_classq_t *fq_cl;
744 VERIFY(pri < FQ_IF_MAX_CLASSES);
745 fq_cl = &fqg->fqg_classq[pri];
746
747 VERIFY(fq_cl->fcl_quantum == 0);
748 VERIFY(quantum != 0);
749 fq_cl->fcl_quantum = quantum;
750 fq_cl->fcl_pri = pri;
751 fq_cl->fcl_drr_max = drr_max;
752 fq_cl->fcl_service_class = svc_class;
753 fq_cl->fcl_next_tx_time = 0;
754 fq_cl->fcl_flags = 0;
755 STAILQ_INIT(&fq_cl->fcl_new_flows);
756 STAILQ_INIT(&fq_cl->fcl_old_flows);
757 }
758
/*
 * Enqueue a chain of `cnt` packets (`bytes` total) into the scheduler.
 * The target class queue is derived from the packet's service class and
 * (SKYWALK) its queue-set group index.  *pdrop reports whether the
 * chain was dropped.  Returns 0, an advisory error (EQFULL/EQCONGESTED/
 * EQSUSPENDED with the packet still enqueued or dropped as indicated by
 * *pdrop), or ENOBUFS on a hard drop.
 */
int
fq_if_enqueue(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
	uint8_t pri, grp_idx = 0;
	fq_if_t *fqs;
	fq_if_classq_t *fq_cl;
	fq_if_group_t *fq_group;
	int ret;
	mbuf_svc_class_t svc;
	pktsched_pkt_t pkt;

	pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

	IFCQ_LOCK_SPIN(ifq);
	fqs = (fq_if_t *)ifq->ifcq_disc;
	svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
	/* Kernel packets carry their target queue-set (group) index */
	if (head->cp_ptype == QP_PACKET) {
		grp_idx = head->cp_kpkt->pkt_qset_idx;
	}
#endif /* SKYWALK */
	pri = fq_if_service_to_priority(fqs, svc);
	VERIFY(pri < FQ_IF_MAX_CLASSES);

	fq_group = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_group->fqg_classq[pri];

	if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
		IFCQ_UNLOCK(ifq);
		/* BK_SYS is currently throttled */
		os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed);
		if (__improbable(droptap_verbose > 0)) {
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
		*pdrop = TRUE;
		ret = EQSUSPENDED;
		goto done;
	}

	ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
	ret = fqs->fqs_enqueue(fqs, fq_group, &pkt, fq_cl);
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
		    (1 << pri)) == 0) {
			/*
			 * this group is not in ER or EB groups,
			 * mark it as IB
			 */
			pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
		}
	}

	if (__improbable(ret != 0)) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_COMPRESSED) {
			/* packet was compressed into an existing one: success */
			ret = 0;
			*pdrop = FALSE;
		} else if (ret == CLASSQEQ_CONGESTED) {
			ret = EQCONGESTED;
			*pdrop = FALSE;
		} else {
			/* Hard drop: packet was not enqueued */
			IFCQ_UNLOCK(ifq);
			*pdrop = TRUE;
			pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_FULL,
			    __func__, __LINE__, 0);
			switch (ret) {
			case CLASSQEQ_DROP:
				ret = ENOBUFS;
				goto done;
			case CLASSQEQ_DROP_FC:
				ret = EQFULL;
				goto done;
			case CLASSQEQ_DROP_SP:
				ret = EQSUSPENDED;
				goto done;
			default:
				VERIFY(0);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/* NOTREACHED */
			__builtin_unreachable();
		}
	} else {
		*pdrop = FALSE;
	}
	/* Account the enqueued chain at both ifcq and group level */
	IFCQ_ADD_LEN(ifq, cnt);
	IFCQ_INC_BYTES(ifq, bytes);


	FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
	FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

	IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
	/* Optionally suppress flow-control advisories for testing */
	if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
		ret = 0;
	}
#endif /* DEBUG || DEVELOPMENT */
	return ret;
}
868
869 static inline void
fq_dqlist_add(flowq_dqlist_t * fq_dqlist_head,fq_t * fq)870 fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
871 {
872 ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
873 ASSERT(!fq->fq_in_dqlist);
874 STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
875 fq->fq_in_dqlist = true;
876 }
877
/*
 * Unlink fq from the pending-dequeue list, splicing any packets
 * accumulated on the flow's private chain (fq_dq_head/fq_dq_tail) onto
 * the caller's head/tail chain, then reset the flow's chain state.
 *
 * NOTE(review): cp_mbuf is used as a generic "chain is empty" test even
 * on the QP_PACKET path — presumably valid because classq_pkt_t overlays
 * both pointers; confirm against the classq_pkt_t definition.
 */
static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
	ASSERT(fq->fq_in_dqlist);
	if (fq->fq_dq_head.cp_mbuf == NULL) {
		goto done;
	}

	if (head->cp_mbuf == NULL) {
		/* Caller's chain is empty; adopt the flow's chain wholesale */
		*head = fq->fq_dq_head;
	} else {
		ASSERT(tail->cp_mbuf != NULL);

		switch (ptype) {
		case QP_MBUF:
			ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
			tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
			ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
			break;
#if SKYWALK
		case QP_PACKET:
			ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
			tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
			ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
			break;
#endif /* SKYWALK */
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}
	*tail = fq->fq_dq_tail;
done:
	STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;
}
918
919 static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t * fq_dqlist_head,classq_pkt_t * head,classq_pkt_t * tail,classq_pkt_type_t ptype)920 fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
921 classq_pkt_t *tail, classq_pkt_type_t ptype)
922 {
923 fq_t *fq, *tfq;
924
925 STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
926 fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
927 }
928 }
929
930 static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)931 fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
932 fq_if_group_t **selected_grp)
933 {
934 #pragma unused(pri)
935
936 fq_if_group_t *grp;
937 uint32_t highest_pri = FQ_IF_MAX_CLASSES;
938 int ret_pri = 0;
939
940 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
941 uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
942 /* bitmap is empty in this case */
943 if (cur_pri == 0) {
944 continue;
945 }
946 if (cur_pri <= highest_pri) {
947 highest_pri = cur_pri;
948 ret_pri = cur_pri;
949 *selected_grp = grp;
950 }
951 }
952 return ret_pri;
953 }
954
955 static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)956 fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
957 {
958 #pragma unused(pri)
959
960 fq_if_group_t *grp;
961
962 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
963 if (grp->fqg_bitmaps[state] != 0) {
964 return FALSE;
965 }
966 }
967 return TRUE;
968 }
969
970 static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)971 fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
972 fq_if_state src_state)
973 {
974 #pragma unused(pri)
975
976 fq_if_group_t *grp;
977 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
978 grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
979 }
980 }
981
982 static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)983 fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
984 {
985 #pragma unused(pri)
986
987 fq_if_group_t *grp;
988 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
989 grp->fqg_bitmaps[state] = 0;
990 }
991 }
992
993 static void
fq_if_grps_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)994 fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
995 fq_if_state src_state)
996 {
997 #pragma unused(pri)
998
999 fq_if_group_t *grp;
1000 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1001 grp->fqg_bitmaps[dst_state] =
1002 grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state];
1003 grp->fqg_bitmaps[src_state] = 0;
1004 }
1005 }
1006
1007 static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t * grp_list,int pri,fq_if_state state,fq_if_group_t ** selected_grp)1008 fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
1009 fq_if_group_t **selected_grp)
1010 {
1011 fq_if_group_t *grp;
1012 int ret_pri = 0;
1013
1014 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1015 if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
1016 /* +1 to match the semantics of pktsched_ffs */
1017 ret_pri = pri + 1;
1018 *selected_grp = grp;
1019 break;
1020 }
1021 }
1022
1023 return ret_pri;
1024 }
1025
1026 static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)1027 fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
1028 {
1029 fq_if_group_t *grp;
1030
1031 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1032 if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
1033 return FALSE;
1034 }
1035 }
1036 return TRUE;
1037 }
1038
1039 static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)1040 fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
1041 fq_if_state src_state)
1042 {
1043 fq_if_group_t *grp;
1044
1045 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1046 pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
1047 &grp->fqg_bitmaps[src_state]);
1048 }
1049 }
1050
1051 static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t * grp_list,int pri,fq_if_state state)1052 fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
1053 {
1054 fq_if_group_t *grp;
1055
1056 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1057 pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
1058 }
1059 }
1060
1061 static void
fq_if_grps_sc_bitmap_move(fq_grp_tailq_t * grp_list,int pri,fq_if_state dst_state,fq_if_state src_state)1062 fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
1063 fq_if_state src_state)
1064 {
1065 fq_if_group_t *grp;
1066
1067 TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
1068 pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state],
1069 &grp->fqg_bitmaps[src_state]);
1070 pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]);
1071 }
1072 }
1073
1074 /*
1075 * Pacemaker is only scheduled when no packet can be dequeued from AQM
1076 * due to pacing. Pacemaker will doorbell the driver when current >= next_tx_time.
1077 * This only applies to L4S traffic at this moment.
1078 */
1079 static void
fq_if_schedule_pacemaker(fq_if_t * fqs,uint64_t now,uint64_t next_tx_time)1080 fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
1081 {
1082 uint64_t deadline = 0;
1083 if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) {
1084 return;
1085 }
1086 ASSERT(next_tx_time != FQ_INVALID_TX_TS);
1087 ASSERT(fqs->fqs_pacemaker_tcall != NULL);
1088 ASSERT(now < next_tx_time);
1089
1090 DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, fqs->fqs_ifq->ifcq_ifp,
1091 uint64_t, next_tx_time - now);
1092 KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index, now,
1093 next_tx_time, next_tx_time - now);
1094
1095 clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1, &deadline);
1096 thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
1097 }
1098
/*
 * Common dequeue path shared by fq_if_dequeue and fq_if_dequeue_sc.
 *
 * Repeatedly selects the highest-priority eligible class across the
 * relevant group list (combined groups, or a temporary one-entry list
 * for a standalone group) using the scheduler's bitmap ops, dequeues up
 * to maxpktcnt/maxbytecnt worth of packets, and returns the chained
 * result via first_packet/last_packet.  Per-class DRR budgets govern
 * when a class drops from ER to IB; with pacing enabled (L4S), classes
 * whose flows are all paced are parked in IR and a pacemaker thread
 * call is armed for the earliest next_tx_time observed.
 *
 * Packet/byte totals are reported through retpktcnt/retbytecnt when
 * non-NULL.  Always returns 0.  Called with the ifclassq lock held.
 */
static int
fq_if_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	uint32_t total_pktcnt = 0, total_bytecnt = 0;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_classq_t *fq_cl;
	fq_grp_tailq_t *grp_list, tmp_grp_list;
	fq_if_group_t *__single fq_grp = NULL;
	fq_if_t *fqs;
	uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
	int pri = 0, svc_pri = 0;
	bool all_paced = true;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	fqs = (fq_if_t *)ifq->ifcq_disc;
	STAILQ_INIT(&fq_dqlist_head);

	/* Pick the chain-append routine matching the queue's packet type. */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	now = fq_codel_get_time();
	/*
	 * Driver-managed schedulers dequeue one service class at a time;
	 * otherwise the svc argument must be unspecified.
	 */
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		svc_pri = fq_if_service_to_priority(fqs, svc);
	} else {
		VERIFY(svc == MBUF_SC_UNSPEC);
	}

	/*
	 * Schedule across all combined groups, or build a temporary
	 * single-entry list when the group stands alone.
	 */
	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		grp_list = &fqs->fqs_combined_grp_list;
		VERIFY(!TAILQ_EMPTY(grp_list));
	} else {
		grp_list = &tmp_grp_list;
		fq_grp = fq_if_find_grp(fqs, grp_idx);
		TAILQ_INIT(grp_list);
		TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
	}

	for (;;) {
		uint32_t pktcnt = 0, bytecnt = 0;
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		bool fq_cl_all_paced = false;
		uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;

		/*
		 * No class is in ER or EB: start a new round by promoting
		 * IB to EB.  If there is still nothing runnable, restore
		 * any paced classes parked in IR and stop.
		 */
		if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
		    fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
			fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
			fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
			if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
				if (fq_codel_enable_pacing && fq_codel_enable_l4s) {
					/*
					 * Move fq_cl in IR back to ER, so that they will be
					 * inspected with priority the next time the driver
					 * dequeues
					 */
					fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
					fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IR);
				}
				break;
			}
		}
		pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
		if (pri == 0) {
			/*
			 * There are no ER flows, move the highest
			 * priority one from EB if there are any in that
			 * category
			 */
			pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
			VERIFY(pri > 0);
			VERIFY(fq_grp != NULL);
			pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
			pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		}
		VERIFY(fq_grp != NULL);
		pri--; /* index starts at 0 */
		fq_cl = &fq_grp->fqg_classq[pri];

		if (fq_cl->fcl_budget <= 0) {
			/* Update the budget */
			fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
			    fq_cl->fcl_stat.fcl_flows_cnt) *
			    fq_cl->fcl_quantum);
			if (fq_cl->fcl_budget <= 0) {
				goto state_change;
			}
		}
		fq_if_dequeue_class(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
		    &fq_cl_next_tx_time);
		if (head.cp_mbuf != NULL) {
			/* Append this class's chain to the running chain. */
			ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
			/* tmp stays empty; appending it NUL-terminates the chain */
			append_pkt(&last, &tmp);
		}
		/* Track the earliest wake-up among fully-paced classes. */
		if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			next_tx_time = fq_cl_next_tx_time;
		}
		fq_cl->fcl_budget -= bytecnt;
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		/*
		 * If the class has exceeded the budget but still has data
		 * to send, move it to IB
		 */
state_change:
		VERIFY(fq_grp != NULL);
		all_paced &= fq_cl_all_paced;
		if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
			if (fq_cl->fcl_budget <= 0) {
				pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
				pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			} else if (fq_cl_all_paced) {
				if (fq_codel_enable_pacing && fq_codel_enable_l4s) {
					/*
					 * If a fq_cl still has budget but only paced queues,
					 * park it to IR so that we will not keep looping
					 * over it
					 */
					pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IR]);
					pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
				}
			}
		} else {
			/* Class went idle: it must not stay in any run state. */
			pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
			VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
			    fq_grp->fqg_bitmaps[FQ_IF_EB] |
			    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
			fq_cl->fcl_budget = 0;
		}
		if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
			if (fq_codel_enable_pacing && fq_codel_enable_l4s) {
				/*
				 * Move fq_cl in IR back to ER, so that they will be
				 * inspected with priority the next time the driver
				 * dequeues
				 */
				fqs->grp_bitmaps_move(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
			}
			break;
		}
	}

	/* Tear down the temporary single-group list built above. */
	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
		VERIFY(TAILQ_EMPTY(grp_list));
	}

	/* Collect packet chains staged on flows during batched dequeue. */
	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
	    fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}
	/* Something remained paced: arm the pacemaker for the earliest slot. */
	if (next_tx_time != FQ_INVALID_TX_TS) {
		ASSERT(next_tx_time > now);
		fq_if_schedule_pacemaker(fqs, now, next_tx_time);
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}
1299
1300 int
fq_if_dequeue(struct ifclassq * ifq,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1301 fq_if_dequeue(struct ifclassq *ifq, u_int32_t maxpktcnt,
1302 u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1303 classq_pkt_t *last_packet, u_int32_t *retpktcnt,
1304 u_int32_t *retbytecnt, uint8_t grp_idx)
1305 {
1306 return fq_if_dequeue_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
1307 first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1308 }
1309
1310 int
fq_if_dequeue_sc(struct ifclassq * ifq,mbuf_svc_class_t svc,u_int32_t maxpktcnt,u_int32_t maxbytecnt,classq_pkt_t * first_packet,classq_pkt_t * last_packet,u_int32_t * retpktcnt,u_int32_t * retbytecnt,uint8_t grp_idx)1311 fq_if_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t svc,
1312 u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
1313 classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
1314 uint8_t grp_idx)
1315 {
1316 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1317
1318 if (fq_if_is_grp_combined(fqs, grp_idx)) {
1319 return fq_if_dequeue_common(ifq, svc, maxpktcnt, maxbytecnt,
1320 first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1321 } else {
1322 /*
1323 * take a shortcut here since there is no need to schedule
1324 * one single service class.
1325 */
1326 return fq_if_dequeue_sc_separate(ifq, svc, maxpktcnt, maxbytecnt,
1327 first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
1328 }
1329 }
1330
/*
 * Dequeue for a single service class from a standalone (non-combined)
 * group.  Only one class queue is involved, so the inter-class DRR
 * scheduling in fq_if_dequeue_common is skipped entirely.  Results and
 * locking conventions match fq_if_dequeue_common.
 */
static int
fq_if_dequeue_sc_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
	uint8_t pri;
	u_int32_t total_pktcnt = 0, total_bytecnt = 0;
	fq_if_classq_t *fq_cl;
	classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt);
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_if_append_pkt_t append_pkt;
	flowq_dqlist_t fq_dqlist_head;
	fq_if_group_t *fq_grp;
	uint64_t now;

	/* Pick the chain-append routine matching the queue's packet type. */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		append_pkt = fq_if_append_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		append_pkt = fq_if_append_pkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	STAILQ_INIT(&fq_dqlist_head);
	now = fq_codel_get_time();

	/* Resolve (group, service class) to the one class queue involved. */
	pri = fq_if_service_to_priority(fqs, svc);
	fq_grp = fq_if_find_grp(fqs, grp_idx);
	fq_cl = &fq_grp->fqg_classq[pri];

	/*
	 * Now we have the queue for a particular service class. We need
	 * to dequeue as many packets as needed, first from the new flows
	 * and then from the old flows.
	 */
	while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
	    fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
		classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
		classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
		u_int32_t pktcnt = 0, bytecnt = 0;
		bool all_paced = false;
		uint64_t next_tx_time = FQ_INVALID_TX_TS;

		fq_if_dequeue_class(fqs, fq_cl, (maxpktcnt - total_pktcnt),
		    (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
		    &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time);
		if (head.cp_mbuf != NULL) {
			/* Append this batch to the running chain. */
			if (first.cp_mbuf == NULL) {
				first = head;
			} else {
				ASSERT(last.cp_mbuf != NULL);
				append_pkt(&last, &head);
			}
			last = tail;
		}
		total_pktcnt += pktcnt;
		total_bytecnt += bytecnt;

		/* Remaining flows are paced: arm the pacemaker and stop. */
		if (next_tx_time != FQ_INVALID_TX_TS) {
			ASSERT(next_tx_time > now);
			fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
			fq_if_schedule_pacemaker(fqs, now, next_tx_time);
			break;
		}
	}

	/*
	 * Mark classq as IB if it's not idle, so that we can
	 * start without re-init the bitmaps when it's switched
	 * to combined mode.
	 */
	if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
		pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
	} else {
		pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
		VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
		    fq_grp->fqg_bitmaps[FQ_IF_EB] |
		    fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
	}

	/* Collect packet chains staged on flows during batched dequeue. */
	fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);

	if (__probable(first_packet != NULL)) {
		*first_packet = first;
	}
	if (last_packet != NULL) {
		*last_packet = last;
	}
	if (retpktcnt != NULL) {
		*retpktcnt = total_pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = total_bytecnt;
	}

	IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
	fq_if_purge_empty_flow_list(fqs, now, false);
	return 0;
}
1443
/*
 * Drain and free (or droptap) every packet queued on "fq", then walk the
 * flow through its list states (new -> old -> empty) and release it if it
 * ends up on the empty list.  When the owning class queue goes idle, its
 * bit is cleared from every scheduler state bitmap.  Purged packet/byte
 * totals are returned via pktsp/bytesp when non-NULL.
 */
static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
	fq_if_classq_t *fq_cl;
	u_int32_t pkts, bytes;
	pktsched_pkt_t pkt;
	fq_if_group_t *grp;

	/* Cache class and group up front; fq may be released below. */
	fq_cl = &FQ_CLASSQ(fq);
	grp = FQ_GROUP(fq);
	pkts = bytes = 0;
	_PKTSCHED_PKT_INIT(&pkt);
	for (;;) {
		fqs->fqs_dequeue(fqs, fq, &pkt, now);
		if (pkt.pktsched_pkt_mbuf == NULL) {
			VERIFY(pkt.pktsched_ptype == QP_INVALID);
			break;
		}
		pkts++;
		bytes += pktsched_get_pkt_len(&pkt);
		/* With verbose droptap, record the drop; otherwise just free. */
		if (__improbable(droptap_verbose > 0)) {
			pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_PURGE_FLOW,
			    __func__, __LINE__, 0);
		} else {
			pktsched_free_pkt(&pkt);
		}
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

	IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

	/* move through the flow queue states */
	VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
	if (fq->fq_flags & FQF_NEW_FLOW) {
		fq_if_empty_new_flow(fq, fq_cl);
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		fq_if_empty_old_flow(fqs, fq_cl, fq, now);
	}
	if (fq->fq_flags & FQF_EMPTY_FLOW) {
		fq_if_purge_empty_flow(fqs, fq);
		/* NULLed because fq_if_purge_empty_flow may release fq */
		fq = NULL;
	}

	/* Idle class: remove it from every scheduler state bitmap. */
	if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
		int i;
		for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
			pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
		}
	}

	if (pktsp != NULL) {
		*pktsp = pkts;
	}
	if (bytesp != NULL) {
		*bytesp = bytes;
	}
}
1504
/*
 * Flush every flow on one class queue (new and old flow lists) and reset
 * its DRR budget.  fq_if_purge_flow unlinks each flow from these lists,
 * hence the _SAFE iteration.
 */
static void
fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
	fq_t *fq, *tfq;
	uint64_t now;

	now = fq_codel_get_time();
	/*
	 * Take each flow from new/old flow list and flush mbufs
	 * in that flow
	 */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
	}
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		fq_if_purge_flow(fqs, fq, NULL, NULL, now);
	}
	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
	VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));

	/* Lists are already empty; re-init defensively and zero the budget. */
	STAILQ_INIT(&fq_cl->fcl_new_flows);
	STAILQ_INIT(&fq_cl->fcl_old_flows);
	fq_cl->fcl_budget = 0;
}
1529
/*
 * Purge the entire scheduler: every allocated group's classes and flows,
 * the empty-flow list, and the interface queue length/byte counters.
 */
static void
fq_if_purge(fq_if_t *fqs)
{
	uint64_t now;
	fq_if_group_t *grp;
	int i;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		/* Group slots may be sparsely populated. */
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);
		fq_if_purge_grp(fqs, grp);
	}

	now = fq_codel_get_time();
	/* third argument true: purge the empty-flow list unconditionally */
	fq_if_purge_empty_flow_list(fqs, now, true);

	VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
	VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));

	fqs->fqs_large_flow = NULL;
	/* Every flow hash bucket must be empty by now. */
	for (i = 0; i < fqs->fqs_flows_count; i++) {
		VERIFY(LIST_EMPTY(&fqs->fqs_flows[i]));
	}

	IFCQ_LEN(fqs->fqs_ifq) = 0;
	IFCQ_BYTES(fqs->fqs_ifq) = 0;
}
1561
/*
 * Purge all packets belonging to one (flow hash, service class) pair in
 * every group.  Aggregated purged packet/byte counts are returned in req.
 */
static void
fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
{
	fq_t *fq;
	uint64_t now;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	req->packets = req->bytes = 0;
	VERIFY(req->flow != 0);

	now = fq_codel_get_time();

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		uint32_t bytes = 0, pkts = 0;

		grp = fq_if_find_grp(fqs, grp_idx);
		/*
		 * Packet and traffic type are needed only if we want
		 * to create a flow queue; the final false argument asks
		 * for lookup only, so zeros suffice here.
		 */
		fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, 0, 0, FQ_TFC_C, false);
		if (fq != NULL) {
			fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
			req->bytes += bytes;
			req->packets += pkts;
		}
	}
}
1594
1595 static uint32_t
fq_if_calc_quantum(struct ifnet * ifp)1596 fq_if_calc_quantum(struct ifnet *ifp)
1597 {
1598 uint32_t quantum, hwassist_flags;
1599
1600 switch (ifp->if_family) {
1601 case IFNET_FAMILY_ETHERNET:
1602 VERIFY(ifp->if_mtu <= IF_MAXMTU);
1603 quantum = ifp->if_mtu + ETHER_HDR_LEN;
1604 break;
1605
1606 case IFNET_FAMILY_CELLULAR:
1607 case IFNET_FAMILY_IPSEC:
1608 case IFNET_FAMILY_UTUN:
1609 VERIFY(ifp->if_mtu <= UINT16_MAX);
1610 quantum = ifp->if_mtu;
1611 break;
1612
1613 default:
1614 quantum = FQ_CODEL_DEFAULT_QUANTUM;
1615 break;
1616 }
1617
1618 hwassist_flags = if_get_driver_hwassist(ifp);
1619 if ((hwassist_flags & IFNET_TSOF) != 0) {
1620 VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
1621 VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
1622 quantum = MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
1623 quantum = (quantum != 0) ? quantum : IF_MAXMTU;
1624 }
1625
1626 quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
1627 #if DEBUG || DEVELOPMENT
1628 quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
1629 #endif /* DEBUG || DEVELOPMENT */
1630 VERIFY(quantum != 0);
1631 return quantum;
1632 }
1633
/*
 * Recompute every group's per-class DRR quanta after an MTU change.
 * Driver-managed schedulers expose only four classes (BK/BE/VI/VO);
 * the full scheduler updates all ten.
 */
static void
fq_if_mtu_update(fq_if_t *fqs)
{
/* Helper: set one class's quantum from its per-class scaling macro. */
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q)                        \
    (_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum =           \
    FQ_CODEL_QUANTUM_ ## _s(_q)                                        \

	uint32_t quantum;
	fq_if_group_t *grp;

	quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		/* Group slots may be sparsely populated. */
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);

		if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
		} else {
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
			_FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
		}
	}
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}
1673
1674 static void
fq_if_event(fq_if_t * fqs,cqev_t ev)1675 fq_if_event(fq_if_t *fqs, cqev_t ev)
1676 {
1677 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
1678
1679 switch (ev) {
1680 case CLASSQ_EV_LINK_UP:
1681 case CLASSQ_EV_LINK_DOWN:
1682 fq_if_purge(fqs);
1683 break;
1684 case CLASSQ_EV_LINK_MTU:
1685 fq_if_mtu_update(fqs);
1686 break;
1687 default:
1688 break;
1689 }
1690 }
1691
1692 static void
fq_if_classq_suspend(fq_if_t * fqs,fq_if_classq_t * fq_cl)1693 fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1694 {
1695 fq_if_purge_classq(fqs, fq_cl);
1696 fqs->fqs_throttle = 1;
1697 fq_cl->fcl_stat.fcl_throttle_on++;
1698 KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
1699 fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1700 }
1701
1702 static void
fq_if_classq_resume(fq_if_t * fqs,fq_if_classq_t * fq_cl)1703 fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
1704 {
1705 VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
1706 fqs->fqs_throttle = 0;
1707 fq_cl->fcl_stat.fcl_throttle_off++;
1708 KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
1709 fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
1710 }
1711
1712
/*
 * Get or set the interface throttling level.  Setting a new level
 * suspends (opportunistic) or resumes (off) the BK_SYS class queue in
 * every group.  Returns EALREADY when the requested level is already in
 * effect, 0 otherwise.  Called with the ifclassq lock held.
 */
static int
fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
{
	struct ifclassq *ifq = fqs->fqs_ifq;
	uint8_t index;
	fq_if_group_t *grp;

#if !MACH_ASSERT
#pragma unused(ifq)
#endif
	IFCQ_LOCK_ASSERT_HELD(ifq);

	/* Query: report the current level without changing anything. */
	if (!tr->set) {
		tr->level = fqs->fqs_throttle;
		return 0;
	}

	if (tr->level == fqs->fqs_throttle) {
		return EALREADY;
	}

	/* Throttling is allowed on BK_SYS class only */
	index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}
		grp = fq_if_find_grp(fqs, grp_idx);
		switch (tr->level) {
		case IFNET_THROTTLE_OFF:
			fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
			break;
		case IFNET_THROTTLE_OPPORTUNISTIC:
			fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
			break;
		default:
			break;
		}
	}
	return 0;
}
1755
1756 static inline boolean_t
fq_if_is_fq_cl_paced(fq_if_classq_t * fq_cl,uint64_t now)1757 fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now)
1758 {
1759 if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) {
1760 return true;
1761 }
1762
1763 fq_cl->fcl_flags &= ~FCL_PACED;
1764 fq_cl->fcl_next_tx_time = 0;
1765 return false;
1766 }
1767
1768 static void
fq_if_grp_stat_sc(fq_if_t * fqs,fq_if_group_t * grp,cqrq_stat_sc_t * stat,uint64_t now)1769 fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now)
1770 {
1771 uint8_t pri;
1772 fq_if_classq_t *fq_cl;
1773
1774 ASSERT(stat != NULL);
1775 pri = fq_if_service_to_priority(fqs, stat->sc);
1776
1777 fq_cl = &grp->fqg_classq[pri];
1778 stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
1779 stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;
1780
1781 if (fq_codel_enable_pacing && fq_codel_enable_l4s &&
1782 fq_if_is_fq_cl_paced(fq_cl, now)) {
1783 stat->packets = 0;
1784 stat->bytes = 0;
1785 }
1786 }
1787
1788 static boolean_t
fq_if_is_grp_all_paced(fq_if_group_t * grp)1789 fq_if_is_grp_all_paced(fq_if_group_t *grp)
1790 {
1791 fq_if_classq_t *fq_cl;
1792 uint64_t now;
1793
1794 if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) {
1795 return false;
1796 }
1797
1798 now = fq_codel_get_time();
1799 for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) {
1800 fq_cl = &grp->fqg_classq[fq_cl_idx];
1801 if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) {
1802 continue;
1803 }
1804 if (!fq_if_is_fq_cl_paced(fq_cl, now)) {
1805 return false;
1806 }
1807 }
1808
1809 return true;
1810 }
1811
1812 boolean_t
fq_if_allow_dequeue(struct ifclassq * ifq)1813 fq_if_allow_dequeue(struct ifclassq *ifq)
1814 {
1815 fq_if_group_t *grp;
1816 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
1817
1818 IFCQ_LOCK_ASSERT_HELD(ifq);
1819
1820 if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) {
1821 return false;
1822 }
1823
1824 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
1825 grp = fqs->fqs_classq_groups[grp_idx];
1826 if (grp == NULL || FQG_BYTES(grp) == 0) {
1827 continue;
1828 }
1829
1830 if (!fq_if_is_grp_all_paced(grp)) {
1831 return false;
1832 }
1833 }
1834
1835 return true;
1836 }
1837
/*
 * Report packet/byte counts for a (service class, group) selection into
 * "stat".  IF_CLASSQ_ALL_GRPS aggregates across every group; a specific
 * group index reports either the combined group list or the single
 * group.  With pacing+L4S enabled, fully-paced groups/classes are
 * reported as empty (see fq_if_allow_dequeue / fq_if_is_grp_all_paced).
 */
void
fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
{
	cqrq_stat_sc_t grp_sc_stat;
	fq_if_group_t *grp;
	uint64_t now = fq_codel_get_time();

	if (stat == NULL) {
		return;
	}
	/* Per-group scratch request; only the sc field carries over. */
	grp_sc_stat.sc = stat->sc;
	stat->packets = 0;
	stat->bytes = 0;

	if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
		if (stat->sc == MBUF_SC_UNSPEC) {
			/*
			 * Whole-interface totals; left at zero when every
			 * group is fully paced (dequeue not allowed yet).
			 */
			if (!fq_if_allow_dequeue(fqs->fqs_ifq)) {
				stat->packets = IFCQ_LEN(fqs->fqs_ifq);
				stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
			}
		} else {
			/* One service class, summed across every group. */
			for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
				grp = fqs->fqs_classq_groups[grp_idx];
				if (grp == NULL) {
					continue;
				}

				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		}
		return;
	}

	if (stat->sc == MBUF_SC_UNSPEC) {
		/* All classes: group totals (paced groups count as empty). */
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				stat->packets += FQG_LEN(grp);
				stat->bytes += FQG_BYTES(grp);
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			if (!fq_if_is_grp_all_paced(grp)) {
				stat->packets = FQG_LEN(grp);
				stat->bytes = FQG_BYTES(grp);
			}
		}
	} else {
		/* One class within the combined list or a single group. */
		if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
			TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
				if (fq_if_is_grp_all_paced(grp)) {
					continue;
				}
				fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
				stat->packets += grp_sc_stat.packets;
				stat->bytes += grp_sc_stat.bytes;
			}
		} else {
			grp = fq_if_find_grp(fqs, stat->grp_idx);
			fq_if_grp_stat_sc(fqs, grp, stat, now);
		}
	}
}
1905
/*
 * Dispatch an ifclassq control request (purge, purge-by-class, link
 * event, throttle, per-class stats) to the matching handler.
 * Unrecognized request codes are ignored and 0 is returned.  Called
 * with the ifclassq lock held.
 */
int
fq_if_request(struct ifclassq *ifq, cqrq_t rq, void *arg)
{
	int err = 0;
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	/*
	 * These are usually slow operations, convert the lock ahead of time
	 */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	switch (rq) {
	case CLASSQRQ_PURGE:
		fq_if_purge(fqs);
		VERIFY(IFCQ_IS_EMPTY(ifq));
		break;
	case CLASSQRQ_PURGE_SC:
		fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
		break;
	case CLASSQRQ_EVENT:
		fq_if_event(fqs, *(cqev_t *)arg);
		break;
	case CLASSQRQ_THROTTLE:
		fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
		break;
	case CLASSQRQ_STAT_SC:
		fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
		break;
	}
	return err;
}
1938
1939 #if (DEBUG || DEVELOPMENT)
/*
 * sysctl handler (DEBUG/DEVELOPMENT only) to read or write the target
 * queue delay of group 0's FQ_TFC_C traffic class.  Returns ENXIO when
 * the scheduler is not attached, not enabled, or group 0 does not exist.
 */
static int
fq_if_configure_target_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	fq_if_t *__single fqs = arg1;
	uint64_t *target_delay;
	uint64_t new_target = 0;
	int changed;
	int error;

	if (fqs->fqs_ifq == NULL || !IFCQ_IS_ENABLED(fqs->fqs_ifq) || fqs->fqs_classq_groups[0] == NULL) {
		return ENXIO;
	}

	target_delay = &fqs->fqs_classq_groups[0]->fqg_target_qdelays[FQ_TFC_C];
	error = sysctl_io_number(req, *target_delay,
	    sizeof(*target_delay), &new_target, &changed);
	/* Only commit on a successful write request. */
	if (error == 0 && changed != 0) {
		*target_delay = new_target;
	}
	return error;
}
1962 #endif /* (DEBUG || DEVELOPMENT) */
1963
1964
1965
1966 static int
fq_if_setup_common(struct ifclassq * ifq,u_int32_t flags,classq_pkt_type_t ptype,boolean_t legacy)1967 fq_if_setup_common(struct ifclassq *ifq, u_int32_t flags,
1968 classq_pkt_type_t ptype, boolean_t legacy)
1969 {
1970 fq_if_t *fqs = NULL;
1971 int err = 0;
1972
1973 IFCQ_LOCK_ASSERT_HELD(ifq);
1974 VERIFY(ifq->ifcq_disc == NULL);
1975 VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
1976
1977 fqs = fq_if_alloc(ifq, ptype);
1978 if (fqs == NULL) {
1979 return ENOMEM;
1980 }
1981 if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
1982 fqs->fqs_flags |= FQS_DRIVER_MANAGED;
1983 fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
1984 } else {
1985 fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
1986 }
1987
1988 if (legacy) {
1989 fqs->fqs_dequeue = fq_codel_dq_legacy;
1990 fqs->fqs_enqueue = fq_codel_enq_legacy;
1991 fqs->fqs_flags |= FQS_LEGACY;
1992 err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
1993 } else {
1994 fqs->fqs_dequeue = fq_codel_dq;
1995 fqs->fqs_enqueue = fq_codel_enq;
1996 err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL_NEW, fqs);
1997 }
1998
1999 #if (DEBUG || DEVELOPMENT)
2000 struct ifnet *ifp = ifq->ifcq_ifp;
2001 struct skoid *fqs_skoid = (struct skoid *)&fqs->fqs_oid;
2002 skoid_create(fqs_skoid,
2003 SKOID_SNODE(_net_classq_fq_codel_params), if_name(ifp),
2004 CTLFLAG_RW);
2005 skoid_add_handler((struct skoid *)fqs_skoid, "target_delay", CTLFLAG_RW,
2006 fq_if_configure_target_sysctl, fqs, 0);
2007 #endif /* (DEBUG || DEVELOPMENT) */
2008
2009 if (err != 0) {
2010 os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
2011 "failed to attach fq_if: %d\n", __func__, err);
2012 fq_if_destroy(fqs);
2013 return err;
2014 }
2015
2016 /*
2017 * Always create one group. If qset 0 is added later,
2018 * this group will be updated.
2019 */
2020 err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
2021 if (err != 0) {
2022 os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
2023 "failed to create a fq group: %d\n", __func__, err);
2024 fq_if_destroy(fqs);
2025 }
2026
2027 return err;
2028 }
2029
/*
 * Attach an fq-codel scheduler using the legacy enqueue/dequeue entry
 * points (fq_codel_enq_legacy / fq_codel_dq_legacy).  Thin wrapper around
 * fq_if_setup_common().
 */
int
fq_if_setup_legacy(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	return fq_if_setup_common(ifq, flags, ptype, true);
}
2036
/*
 * Attach an fq-codel scheduler using the new enqueue/dequeue entry points
 * (fq_codel_enq / fq_codel_dq).  Thin wrapper around fq_if_setup_common().
 */
int
fq_if_setup_new(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
	return fq_if_setup_common(ifq, flags, ptype, false);
}
2043
2044 fq_t *
fq_if_hash_pkt(fq_if_t * fqs,fq_if_group_t * fq_grp,uint32_t flowid,mbuf_svc_class_t svc_class,uint64_t now,uint8_t pkt_proto,uint8_t pkt_flowsrc,fq_tfc_type_t tfc_type,bool create)2045 fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, uint32_t flowid,
2046 mbuf_svc_class_t svc_class, uint64_t now, uint8_t pkt_proto,
2047 uint8_t pkt_flowsrc, fq_tfc_type_t tfc_type, bool create)
2048 {
2049 fq_t *fq = NULL;
2050 flowq_list_t *fq_list;
2051 fq_if_classq_t *fq_cl;
2052 uint32_t fqs_hash_id;
2053 u_int8_t scidx;
2054
2055 scidx = fq_if_service_to_priority(fqs, svc_class);
2056
2057 fqs_hash_id = fq_if_flow_hash_id(flowid);
2058
2059 fq_list = &fqs->fqs_flows[fqs_hash_id];
2060
2061 LIST_FOREACH(fq, fq_list, fq_hashlink) {
2062 if (fq->fq_flowhash == flowid &&
2063 fq->fq_sc_index == scidx &&
2064 fq->fq_tfc_type == tfc_type &&
2065 fq->fq_group == fq_grp) {
2066 break;
2067 }
2068 }
2069 if (fq == NULL && create) {
2070 /* If the flow is not already on the list, allocate it */
2071 IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
2072 fq = fq_alloc(fqs->fqs_ptype);
2073 if (fq != NULL) {
2074 fq->fq_flowhash = flowid;
2075 fq->fq_sc_index = scidx;
2076 fq->fq_group = fq_grp;
2077 fq->fq_tfc_type = tfc_type;
2078 fq_cl = &FQ_CLASSQ(fq);
2079 fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
2080 fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
2081 fq->fq_next_tx_time = FQ_INVALID_TX_TS;
2082 LIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
2083 fq_cl->fcl_stat.fcl_flows_cnt++;
2084 fq->fq_flags |= fq_codel_enable_ecn ? FQF_ECN_CAPABLE : 0;
2085 if (
2086 #if (DEBUG || DEVELOPMENT)
2087 ifclassq_congestion_feedback &&
2088 #endif /* (DEBUG || DEVELOPMENT) */
2089 tfc_type != FQ_TFC_L4S &&
2090 !(fqs->fqs_flags & FQS_LEGACY) &&
2091 (pkt_proto == IPPROTO_TCP || pkt_proto == IPPROTO_QUIC) &&
2092 (pkt_flowsrc == FLOWSRC_INPCB || pkt_flowsrc == FLOWSRC_CHANNEL)) {
2093 FQ_ENABLE_CONGESTION_FEEDBACK(fq);
2094 fq->fq_flowsrc = pkt_flowsrc;
2095 }
2096 }
2097 KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
2098 fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
2099 AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
2100 } else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
2101 fq_if_reuse_empty_flow(fqs, fq, now);
2102 }
2103
2104 /*
2105 * If getq time is not set because this is the first packet or after
2106 * idle time, set it now so that we can detect a stall.
2107 */
2108 if (fq != NULL && fq->fq_getqtime == 0) {
2109 fq->fq_getqtime = now;
2110 }
2111
2112 return fq;
2113 }
2114
/*
 * Unlink a flow queue from the hash table and free it.  Any outstanding
 * flow-control entry is released first so the flow's source does not stay
 * suspended.  The flow must not be on the empty list; the caller clears
 * FQF_EMPTY_FLOW before calling (see fq_if_purge_empty_flow()).
 */
static void
fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
{
	ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
	LIST_REMOVE(fq, fq_hashlink);
	/* upgrade to the exclusive classq lock for the teardown work below */
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		/* wake the flow-controlled source before the queue goes away */
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}
	KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
	    fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
	fq_destroy(fq, fqs->fqs_ptype);
}
2129
2130 inline boolean_t
fq_if_at_drop_limit(fq_if_t * fqs)2131 fq_if_at_drop_limit(fq_if_t *fqs)
2132 {
2133 return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
2134 TRUE : FALSE;
2135 }
2136
2137 inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t * fqs)2138 fq_if_almost_at_drop_limit(fq_if_t *fqs)
2139 {
2140 /*
2141 * Whether we are above 90% of the queue limit. This is used to tell if we
2142 * can stop flow controlling the largest flow.
2143 */
2144 return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
2145 }
2146
/*
 * Revive a flow parked on the empty-flows list so it can carry traffic
 * again: unlink it from the empty list, clear its transient state flags,
 * and re-arm its update timer.  Counterpart of fq_if_move_to_empty_flow().
 */
static inline void
fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
{
	ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
	TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
	/* make sure the flow does not appear linked on a new/old flow list */
	STAILQ_NEXT(fq, fq_actlink) = NULL;
	fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
	fq->fq_empty_purge_time = 0;
	fq->fq_getqtime = 0;
	fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
	fqs->fqs_empty_list_cnt--;
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
	/* the flow counts toward its class' budget again */
	fq_cl->fcl_stat.fcl_flows_cnt++;
}
2161
2162 inline void
fq_if_move_to_empty_flow(fq_if_t * fqs,fq_if_classq_t * fq_cl,fq_t * fq,uint64_t now)2163 fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
2164 uint64_t now)
2165 {
2166 ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
2167 fq->fq_empty_purge_time = now + fq_empty_purge_delay;
2168 TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
2169 fq->fq_flags |= FQF_EMPTY_FLOW;
2170 FQ_CLEAR_OVERWHELMING(fq);
2171 fqs->fqs_empty_list_cnt++;
2172 /*
2173 * fcl_flows_cnt is used in budget determination for the class.
2174 * empty flow shouldn't contribute to the budget.
2175 */
2176 fq_cl->fcl_stat.fcl_flows_cnt--;
2177 }
2178
2179 static void
fq_if_purge_empty_flow(fq_if_t * fqs,fq_t * fq)2180 fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
2181 {
2182 fq_if_classq_t *fq_cl;
2183 fq_cl = &FQ_CLASSQ(fq);
2184
2185 ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
2186 TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
2187 fq->fq_flags &= ~FQF_EMPTY_FLOW;
2188 fqs->fqs_empty_list_cnt--;
2189 /* Remove from the hash list and free the flow queue */
2190 fq_if_destroy_flow(fqs, fq_cl, fq);
2191 }
2192
/*
 * Walk the empty-flows list and destroy flows whose purge deadline has
 * passed.  At most FQ_EMPTY_PURGE_MAX flows are destroyed per call,
 * unless purge_all is set, in which case the whole list is drained
 * unconditionally.
 */
static void
fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
{
	fq_t *fq, *tmp;
	int i = 0;

	if (fqs->fqs_empty_list_cnt == 0) {
		ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
		return;
	}

	TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
		/*
		 * Flows are appended with monotonically increasing purge
		 * times, so the first not-yet-due flow ends the scan.
		 * The i++ bound is only evaluated (and only counts) when
		 * the deadline check passed.
		 */
		if (!purge_all && ((now < fq->fq_empty_purge_time) ||
		    (i++ == FQ_EMPTY_PURGE_MAX))) {
			break;
		}
		fq_if_purge_empty_flow(fqs, fq);
	}

	if (__improbable(purge_all)) {
		VERIFY(fqs->fqs_empty_list_cnt == 0);
		VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
	}
}
2217
/*
 * Retire a fully drained flow from the old-flows list: unlink it, release
 * any flow control on it, and park it on the empty-flows list for delayed
 * destruction.
 */
static void
fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
	/*
	 * Remove the flow queue from the old flows list.
	 */
	STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
	fq->fq_flags &= ~FQF_OLD_FLOW;
	fq_cl->fcl_stat.fcl_oldflows_cnt--;
	/* a flow only leaves the schedule once it is fully drained */
	VERIFY(fq->fq_bytes == 0);

	/* release any flow control */
	if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}

	/* move the flow queue to empty flows list */
	fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
}
2238
2239 static void
fq_if_empty_new_flow(fq_t * fq,fq_if_classq_t * fq_cl)2240 fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
2241 {
2242 /* Move to the end of old queue list */
2243 STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
2244 flowq, fq_actlink);
2245 fq->fq_flags &= ~FQF_NEW_FLOW;
2246 fq_cl->fcl_stat.fcl_newflows_cnt--;
2247
2248 STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
2249 fq->fq_flags |= FQF_OLD_FLOW;
2250 fq_cl->fcl_stat.fcl_oldflows_cnt++;
2251 }
2252
/*
 * Drop one packet from the head of the cached largest flow
 * (fqs_large_flow) to relieve queue-limit pressure.  No-op when no large
 * flow is currently tracked.
 */
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *__single pkt_flags;
	uint64_t *__single pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* queue can not be empty on the largest flow */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	_PKTSCHED_PKT_INIT(&pkt);
	/* pull the head packet off the flow without the usual accounting */
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	/* scrub scheduler-private metadata before handing the packet off */
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* if that drained the flow, retire it and forget the large-flow cache */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	/* with verbose droptap, record the drop reason; otherwise just free */
	if (__improbable(droptap_verbose > 0)) {
		pktsched_drop_pkt(&pkt, fqs->fqs_ifq->ifcq_ifp, DROP_REASON_AQM_DROP,
		    __func__, __LINE__, 0);
	} else {
		pktsched_free_pkt(&pkt);
	}
	fq_cl->fcl_stat.fcl_drop_overflow++;
}
2313
2314 inline void
fq_if_is_flow_heavy(fq_if_t * fqs,fq_t * fq)2315 fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
2316 {
2317 fq_t *prev_fq;
2318
2319 if (fqs->fqs_large_flow != NULL &&
2320 fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2321 fqs->fqs_large_flow = NULL;
2322 }
2323
2324 if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
2325 return;
2326 }
2327
2328 prev_fq = fqs->fqs_large_flow;
2329 if (prev_fq == NULL) {
2330 if (!fq_empty(fq, fqs->fqs_ptype)) {
2331 fqs->fqs_large_flow = fq;
2332 }
2333 return;
2334 } else if (fq->fq_bytes > prev_fq->fq_bytes) {
2335 fqs->fqs_large_flow = fq;
2336 }
2337 }
2338
/*
 * Put a flow under flow control by queueing a flow-advisory entry for it.
 * Returns TRUE when the flow is (now or already) on the flow-control
 * list, FALSE when the entry could not be allocated.
 */
boolean_t
fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
    fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	/* debug knob: pretend flow control succeeded without doing anything */
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == fq->fq_flowhash) {
			/* Already on flowcontrol list */
			return TRUE;
		}
	}
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		/* XXX Add number of bytes in the queue */
		STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
		fq_cl->fcl_stat.fcl_flow_control++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_control,
		    fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
		    if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}

	/* skywalk channel sources get their advisory state set immediately */
	if (fce != NULL && fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
		kern_channel_flowadv_set(fce);
	}

	return (fce != NULL) ? TRUE : FALSE;
}
2381
/*
 * Unlink a flow-advisory entry from the scheduler's flow-control list and
 * hand it to flowadv_add_entry() so the flow's source can be resumed.
 */
static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	STAILQ_NEXT(fce, fce_link) = NULL;
	flowadv_add_entry(fce);
}
2389
/*
 * Release flow control for a flow: locate its flow-advisory entry (if
 * any), signal feedback to the source via fq_if_remove_fcentry(), and
 * clear FQF_FLOWCTL_ON unconditionally.
 */
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index,
		    fq->fq_bytes);
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	/* cleared even when no entry was found, so the flag cannot leak */
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}
2417
/*
 * Send a congestion-experienced flow-advisory event upstream, carrying
 * the accumulated congestion and L4S CE counters plus the number of
 * packets seen since the last report.  Returns TRUE when the event was
 * allocated and queued, FALSE on allocation failure.
 */
boolean_t
fq_if_report_congestion(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t congestion_cnt,
    uint32_t l4s_ce_cnt, uint32_t pkt_cnt)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	/* debug knob: pretend the report succeeded without sending one */
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED;
		fce->fce_congestion_cnt = congestion_cnt;
		fce->l4s_ce_cnt = l4s_ce_cnt;
		fce->fce_pkts_since_last_report = pkt_cnt;

		flowadv_add_entry(fce);
	}
	return (fce != NULL) ? TRUE : FALSE;
}
2443
2444
/*
 * Dequeue up to pktlimit packets / bytelimit bytes from one service
 * class, implementing the DRR scan of RFC 8290: new flows are served
 * first, then old flows; a flow that exhausts its deficit (or empties)
 * is rotated to the tail of the old list with a fresh quantum.
 *
 * Flows whose pacing tx-time has not yet arrived are skipped; the
 * earliest such time across the class is reported through *next_tx_time,
 * and when every flow is paced the class is marked FCL_PACED and
 * *fq_cl_paced is set.  When fq_dqlist is non-NULL, dequeued packets are
 * collected per flow on that list instead of being chained onto
 * top/bottom directly.
 */
void
fq_if_dequeue_class(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now, bool *fq_cl_paced,
    uint64_t *next_tx_time)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	bool all_paced = true;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;
	uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS;

	/* pick the packet-type-specific per-flow dequeue routine */
	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

	/* first pass: new flows */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
		ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_NEW_FLOW);
		uint64_t fq_tx_time;
		/* paced flow: remember its deadline and skip it this round */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/*
		 * From RFC 8290:
		 * if that queue has a negative number of credits (i.e., it has already
		 * dequeued at least a quantum of bytes), it is given an additional
		 * quantum of credits, the queue is put onto _the end of_ the list of
		 * old queues, and the routine selects the next queue and starts again.
		 */
		if (fq->fq_deficit <= 0 || qempty) {
			fq->fq_deficit += fq_cl->fcl_quantum;
			fq_if_empty_new_flow(fq, fq_cl);
		}
		//TODO: add credit when it's now paced? so that the fq is treated the same as empty

		/* the flow may have become paced by what we just dequeued */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (limit_reached) {
			goto done;
		}
	}

	/* second pass: old flows */
	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;
		uint64_t fq_tx_time;

		/* paced flow: remember its deadline and skip it this round */
		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		/* the flow may have become paced by what we just dequeued */
		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	/* splice deficit-rotated flows back onto the tail of the old list */
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}
2629
2630 void
fq_if_teardown(struct ifclassq * ifq)2631 fq_if_teardown(struct ifclassq *ifq)
2632 {
2633 fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
2634
2635 IFCQ_LOCK_ASSERT_HELD(ifq);
2636 VERIFY(fqs != NULL);
2637 VERIFY(ifq->ifcq_type == PKTSCHEDT_FQ_CODEL || ifq->ifcq_type == PKTSCHEDT_FQ_CODEL_NEW);
2638 fq_if_destroy(fqs);
2639 ifq->ifcq_disc = NULL;
2640 ifclassq_detach(ifq);
2641 }
2642
2643 static void
fq_export_flowstats(fq_if_t * fqs,fq_t * fq,struct fq_codel_flowstats * flowstat)2644 fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
2645 struct fq_codel_flowstats *flowstat)
2646 {
2647 bzero(flowstat, sizeof(*flowstat));
2648 flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
2649 flowstat->fqst_bytes = fq->fq_bytes;
2650 flowstat->fqst_flowhash = fq->fq_flowhash;
2651 if (fq->fq_flags & FQF_NEW_FLOW) {
2652 flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
2653 }
2654 if (fq->fq_flags & FQF_OLD_FLOW) {
2655 flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
2656 }
2657 if (fq->fq_flags & FQF_DELAY_HIGH) {
2658 flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
2659 }
2660 if (fq->fq_flags & FQF_FLOWCTL_ON) {
2661 flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
2662 }
2663 if (fqs->fqs_large_flow == fq) {
2664 flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
2665 }
2666 }
2667
2668 int
fq_if_getqstats(struct ifclassq * ifq,uint8_t gid,u_int32_t qid,struct if_ifclassq_stats * ifqs)2669 fq_if_getqstats(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
2670 struct if_ifclassq_stats *ifqs)
2671 {
2672 struct fq_codel_classstats *fcls;
2673 fq_if_classq_t *fq_cl;
2674 fq_if_t *fqs;
2675 fq_t *fq = NULL;
2676 fq_if_group_t *grp;
2677 u_int32_t i, flowstat_cnt;
2678
2679 if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
2680 return EINVAL;
2681 }
2682
2683 fqs = (fq_if_t *)ifq->ifcq_disc;
2684 if (fqs->fqs_classq_groups[gid] == NULL) {
2685 return ENXIO;
2686 }
2687
2688 fcls = &ifqs->ifqs_fq_codel_stats;
2689
2690 fq_cl = &FQS_CLASSQ(fqs, gid, qid);
2691 grp = fq_if_find_grp(fqs, gid);
2692
2693 fcls->fcls_pri = fq_cl->fcl_pri;
2694 fcls->fcls_service_class = fq_cl->fcl_service_class;
2695 fcls->fcls_quantum = fq_cl->fcl_quantum;
2696 fcls->fcls_drr_max = fq_cl->fcl_drr_max;
2697 fcls->fcls_budget = fq_cl->fcl_budget;
2698 fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
2699 fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
2700 fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
2701 fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
2702 fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
2703 fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
2704 fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
2705 fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
2706 fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
2707 fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
2708 fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
2709 fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
2710 fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
2711 fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2712 fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
2713 fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
2714 fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
2715 fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
2716 fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
2717 fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
2718 fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
2719 fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
2720 fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
2721 fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
2722 fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
2723 fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
2724 fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
2725 fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
2726 fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
2727 fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
2728 fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
2729 fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
2730 fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
2731 fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
2732 fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;
2733 fcls->fcls_high_delay_drop = fq_cl->fcl_stat.fcl_high_delay_drop;
2734 fcls->fcls_congestion_feedback = fq_cl->fcl_stat.fcl_congestion_feedback;
2735
2736 /* Gather per flow stats */
2737 flowstat_cnt = min((fcls->fcls_newflows_cnt +
2738 fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
2739 i = 0;
2740 STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
2741 if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
2742 break;
2743 }
2744
2745 /* leave space for a few old flows */
2746 if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
2747 i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
2748 break;
2749 }
2750 fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2751 i++;
2752 }
2753 STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
2754 if (i >= flowstat_cnt) {
2755 break;
2756 }
2757 fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
2758 i++;
2759 }
2760 VERIFY(i <= flowstat_cnt);
2761 fcls->fcls_flowstats_cnt = i;
2762 return 0;
2763 }
2764
/*
 * Create (or, for grp_idx 0, update) a classq group on this scheduler.
 * A new group gets its per-service-class DRR state initialized — the set
 * of classes depends on whether the scheduler is driver managed — and
 * its target delays / update intervals computed from the interface and
 * the IF_CLASSQ_LOW_LATENCY flag.  The IF_DEFAULT_GRP flag also makes
 * the group part of the combined set.
 *
 * Returns 0 on success, EINVAL if a non-zero group already exists, or
 * ENOMEM on allocation failure.
 */
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
#define _FQ_CLASSQ_INIT(_grp, _s, _q)                           \
	fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX,          \
	FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s),      \
	MBUF_SC_ ## _s );

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	/* group 0 is pre-created at setup; re-creating it only updates flags */
	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	/* per-interface DRR quantum, shared by all classes of the group */
	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		static_assert(SCIDX_SIG == SCIDX_VI);
		static_assert(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	/* compute classic and L4S CoDel parameters for this group */
	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	fq_if_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	fq_if_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	fq_if_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	fq_if_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}
2845
2846 fq_if_group_t *
fq_if_find_grp(fq_if_t * fqs,uint8_t grp_idx)2847 fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
2848 {
2849 fq_if_group_t *grp;
2850
2851 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2852 VERIFY(grp_idx < FQ_IF_MAX_GROUPS);
2853
2854 grp = fqs->fqs_classq_groups[grp_idx];
2855 VERIFY(grp != NULL);
2856
2857 return grp;
2858 }
2859
2860 static void
fq_if_purge_grp(fq_if_t * fqs,fq_if_group_t * grp)2861 fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
2862 {
2863 for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
2864 fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
2865 }
2866
2867 bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
2868 grp->fqg_len = 0;
2869 grp->fqg_bytes = 0;
2870 fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
2871 }
2872
2873 void
fq_if_destroy_grps(fq_if_t * fqs)2874 fq_if_destroy_grps(fq_if_t *fqs)
2875 {
2876 fq_if_group_t *__single grp;
2877
2878 IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
2879
2880 for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
2881 if (fqs->fqs_classq_groups[grp_idx] == NULL) {
2882 continue;
2883 }
2884
2885 grp = fq_if_find_grp(fqs, grp_idx);
2886 fq_if_purge_grp(fqs, grp);
2887 zfree(fq_if_grp_zone, grp);
2888 fqs->fqs_classq_groups[grp_idx] = NULL;
2889 }
2890 }
2891
/*
 * Whether the group at grp_idx is currently part of the combined set,
 * i.e. its bit is set in fqs_combined_grp_bitmap.
 */
static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}
2897
2898 void
fq_if_set_grp_combined(struct ifclassq * ifcq,uint8_t grp_idx)2899 fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
2900 {
2901 fq_if_t *fqs;
2902 fq_if_group_t *grp;
2903
2904 IFCQ_LOCK_ASSERT_HELD(ifcq);
2905
2906 fqs = (fq_if_t *)ifcq->ifcq_disc;
2907 grp = fq_if_find_grp(fqs, grp_idx);
2908
2909 if (fq_if_is_grp_combined(fqs, grp_idx)) {
2910 return;
2911 }
2912
2913 /*
2914 * We keep the current fq_deficit and fcl_budget when combining a group.
2915 * That might disrupt the AQM but only for a moment.
2916 */
2917 pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
2918 TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2919 }
2920
2921 void
fq_if_set_grp_separated(struct ifclassq * ifcq,uint8_t grp_idx)2922 fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
2923 {
2924 fq_if_t *fqs;
2925 fq_if_group_t *grp;
2926
2927 IFCQ_LOCK_ASSERT_HELD(ifcq);
2928
2929 fqs = (fq_if_t *)ifcq->ifcq_disc;
2930 grp = fq_if_find_grp(fqs, grp_idx);
2931
2932 if (!fq_if_is_grp_combined(fqs, grp_idx)) {
2933 return;
2934 }
2935
2936 pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
2937 TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
2938 }
2939