1 /*
2 * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <net/if_var.h>
30 #include <net/if_var_private.h>
31 #include <net/dlil_var_private.h>
32 #include <net/dlil.h>
33 #include <net/dlil_sysctl.h>
34
35
/*
 * DLIL_EWMA(old, new, decay): fold a new sample into an exponentially
 * weighted moving average stored in `old'.
 *
 * When `old' is non-zero the update is:
 *     old = (old * (2^decay - 1) + new) / 2^decay
 * i.e. the previous average is weighted (2^decay - 1)/2^decay and the
 * new sample 1/2^decay; larger `decay' values make the average react
 * more slowly. When `old' is zero the average is seeded directly with
 * the new sample.
 *
 * NOTE(review): the shift-based form assumes (old << decay) does not
 * overflow u_int32_t — callers presumably keep averages well below
 * 2^(32 - decay); confirm at call sites.
 */
#define DLIL_EWMA(old, new, decay) do { \
	u_int32_t _avg; \
	if ((_avg = (old)) > 0) \
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
		_avg = (new); \
	(old) = _avg; \
} while (0)
44
45
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 *
 * True when the queue holds mbufs (QP_MBUF) and its current length
 * exceeds BOTH the global `if_rcvq_burst_limit' sysctl and the queue's
 * own configured limit (whichever of the two is larger wins, so a
 * per-queue limit raised above the sysctl is honored).  Wrapped in
 * __improbable() since trimming is expected to be rare.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
	qtype(q) == QP_MBUF)
52
53
/* rate limit debug messages: 1-second interval (no sub-second component) */
struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
56
57 extern void proto_input_run(void);
58
59 static errno_t dlil_input_async(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
60 static errno_t dlil_input_sync(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
61 static void dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, protocol_family_t pf);
62 static void dlil_input_packet_list_common(struct ifnet *, mbuf_ref_t, u_int32_t, ifnet_model_t, boolean_t);
63 static void dlil_input_thread_func(void *, wait_result_t);
64 static void dlil_input_thread_cont(void *, wait_result_t);
65 static inline void dlil_input_wakeup(struct dlil_threading_info *inp);
66
67 static int dlil_interface_filters_input(struct ifnet *, mbuf_ref_ref_t, char **, protocol_family_t, boolean_t);
68
69 static void dlil_main_input_thread_func(void *, wait_result_t);
70 static void dlil_main_input_thread_cont(void *, wait_result_t);
71
72 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
73 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
74
75 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue, dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta);
76
77 static inline mbuf_t handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt);
78 /*
79 * Publicly visible functions.
80 */
81
/*
 * Create the input thread (or, for the synchronous strategy, just the
 * input state) for `ifp', initializing `inp' in the process.
 *
 * The input strategy and thread continuation are selected as follows:
 *  - ifp == NULL: the main (shared) input thread, created once at
 *    dlil_init time;
 *  - legacy driver with RX polling support (IFEF_RXPOLL + IFXF_LEGACY,
 *    net_rxpoll enabled): a dedicated hybrid-polling input thread;
 *  - otherwise, when `net_async' is set or the driver is legacy: a
 *    dedicated asynchronous input thread;
 *  - otherwise (netif-backed, non-legacy): the synchronous strategy,
 *    which runs in the caller's context; no thread is started and
 *    ENODEV is returned to the caller.
 *
 * If `thfunc' is non-NULL it receives the selected thread continuation
 * (NULL for the synchronous strategy).  Panics if a required thread
 * cannot be started.
 */
int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* Hybrid polling requires a legacy driver with RXPOLL capability */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "main_input");
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
		inp->dlth_name = __unsafe_null_terminated_from_indexable(inp->dlth_name_storage);
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	/* per-input-thread lock group, named after the thread itself */
	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	/* receive queue for this input thread; main thread also gets a
	 * dedicated queue for loopback (lo0) traffic */
	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		dlil_main_threading_info_ref_t inpm =
		    __container_of(inp, struct dlil_main_threading_info, inp);
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/*
	 * Synchronous strategy: no thread to start; report ENODEV so the
	 * caller knows no continuation was created (state above is still
	 * fully initialized).
	 */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* importance 0: run the input thread at default precedence */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp __single = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				/* extra ref so teardown can refer to the thread */
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	/* balanced by the decrement in dlil_terminate_input_thread() */
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
224
/*
 * Terminate the calling input thread.  Must be invoked from the input
 * thread itself (never the main input thread).  Drains any packets
 * still queued on the thread, signals termination completion to the
 * waiter, releases the extra thread reference taken at creation, and
 * finally self-terminates; does not return.
 */
void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	ifnet_ref_t ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	/* balances the increment in dlil_create_input_thread() */
	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/*
	 * Under the thread lock: detach all pending packets, then mark
	 * termination complete and wake the thread waiting on dlth_flags.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets (deliberately after dropping the lock) */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
272
273 boolean_t
dlil_is_rxpoll_input(thread_continue_t func)274 dlil_is_rxpoll_input(thread_continue_t func)
275 {
276 return func == dlil_rxpoll_input_thread_func;
277 }
278
/*
 * Entry point for inbound packet chains from drivers: dispatch the
 * chain [m_head, m_tail] (described by the stat increments in `s')
 * to the interface's input strategy (async, sync, or rxpoll).
 * `poll' indicates the chain arrived via polling; `tp' is the driver
 * or poller thread, used for affinity association.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	dlil_threading_info_ref_t inp = ifp->if_inp;

	/* no dedicated input thread: fall back to the main input thread */
	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* test hook: force the synchronous path when the calling thread
	 * is marked NET_THREAD_SYNC_RX */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		/* strategy selected at input-thread creation time */
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
299
300 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)301 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
302 {
303 return dlil_input_packet_list_common(ifp, m, 0,
304 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
305 }
306
307 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)308 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
309 u_int32_t cnt, ifnet_model_t mode)
310 {
311 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
312 }
313
314 /*
315 * Static function implementations.
316 */
/*
 * Hand a packet chain to the protocol attached via `ifproto'.
 *
 * v1 protocols accept a single packet (plus its frame header pointer)
 * per call, so the chain is walked and unlinked one packet at a time;
 * v2 protocols accept the whole list in one call.  On any error other
 * than EJUSTRETURN the packet(s) are dropped; EJUSTRETURN means the
 * callee took ownership.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_ref_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			/*
			 * Version 1 KPI does not accept header len,
			 * hence the pointer to the frame header must be `__single'.
			 */
			char *frame_header_ptr __single;

			mbuf_t next_packet;

			/* unlink this packet from the chain before handoff */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header_ptr = m->m_pkthdr.pkt_hdr;

			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header_ptr);
			if (error != 0 && error != EJUSTRETURN) {
				/* NOTE(review): drop reason DROP_REASON_DLIL_IF_FILTER
				 * is reused here for protocol-input failures — confirm
				 * this is intentional and not a copy of the filter path. */
				m_drop_if(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_drop_list(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
		}
	}
}
354
355 static errno_t
dlil_input_async(struct dlil_threading_info * inp,struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)356 dlil_input_async(struct dlil_threading_info *inp,
357 struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
358 const struct ifnet_stat_increment_param *s, boolean_t poll,
359 struct thread *tp)
360 {
361 u_int32_t m_cnt = s->packets_in;
362 u_int32_t m_size = s->bytes_in;
363 boolean_t notify = FALSE;
364 struct ifnet_stat_increment_param s_adj = *s;
365 dlil_freeq_t freeq;
366 MBUFQ_INIT(&freeq);
367
368 /*
369 * If there is a matching DLIL input thread associated with an
370 * affinity set, associate this thread with the same set. We
371 * will only do this once.
372 */
373 lck_mtx_lock_spin(&inp->dlth_lock);
374 if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
375 ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
376 (poll && inp->dlth_poller_thread == THREAD_NULL))) {
377 u_int32_t tag = inp->dlth_affinity_tag;
378
379 if (poll) {
380 VERIFY(inp->dlth_poller_thread == THREAD_NULL);
381 inp->dlth_poller_thread = tp;
382 } else {
383 VERIFY(inp->dlth_driver_thread == THREAD_NULL);
384 inp->dlth_driver_thread = tp;
385 }
386 lck_mtx_unlock(&inp->dlth_lock);
387
388 /* Associate the current thread with the new affinity tag */
389 (void) dlil_affinity_set(tp, tag);
390
391 /*
392 * Take a reference on the current thread; during detach,
393 * we will need to refer to it in order to tear down its
394 * affinity.
395 */
396 thread_reference(tp);
397 lck_mtx_lock_spin(&inp->dlth_lock);
398 }
399
400 VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
401
402 /*
403 * Because of loopbacked multicast we cannot stuff the ifp in
404 * the rcvif of the packet header: loopback (lo0) packets use a
405 * dedicated list so that we can later associate them with lo_ifp
406 * on their way up the stack. Packets for other interfaces without
407 * dedicated input threads go to the regular list.
408 */
409 if (m_head != NULL) {
410 classq_pkt_t head, tail;
411 class_queue_t *input_queue;
412 CLASSQ_PKT_INIT_MBUF(&head, m_head);
413 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
414 if (inp == dlil_main_input_thread && ifp == lo_ifp) {
415 dlil_main_threading_info_ref_t inpm =
416 __container_of(inp, struct dlil_main_threading_info, inp);
417 input_queue = &inpm->lo_rcvq_pkts;
418 } else {
419 input_queue = &inp->dlth_pkts;
420 }
421
422 _addq_multi(input_queue, &head, &tail, m_cnt, m_size);
423
424 if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
425 dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
426 inp->dlth_trim_pkts_dropped += s_adj.dropped;
427 inp->dlth_trim_cnt += 1;
428
429 os_log_error(OS_LOG_DEFAULT,
430 "%s %s burst limit %u (sysctl: %u) exceeded. "
431 "%u packets dropped [%u total in %u events]. new qlen %u ",
432 __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
433 s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
434 qlen(input_queue));
435 }
436 }
437
438 #if IFNET_INPUT_SANITY_CHK
439 /*
440 * Verify that the original stat increment parameter
441 * accurately describes the input chain `m_head`.
442 * This is not affected by the trimming of input queue.
443 */
444 if (__improbable(dlil_input_sanity_check != 0)) {
445 u_int32_t count = 0, size = 0;
446 struct mbuf *m0;
447
448 for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
449 m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
450 size += m_length(m0);
451 count++;
452 }
453
454 if (count != m_cnt) {
455 panic_plain("%s: invalid total packet count %u "
456 "(expected %u)\n", if_name(ifp), count, m_cnt);
457 /* NOTREACHED */
458 __builtin_unreachable();
459 } else if (size != m_size) {
460 panic_plain("%s: invalid total packet size %u "
461 "(expected %u)\n", if_name(ifp), size, m_size);
462 /* NOTREACHED */
463 __builtin_unreachable();
464 }
465
466 inp->dlth_pkts_cnt += m_cnt;
467 }
468 #else
469 m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
470 #endif /* IFNET_INPUT_SANITY_CHK */
471
472 /* NOTE: use the adjusted parameter, vs the original one */
473 dlil_input_stats_add(&s_adj, inp, ifp, poll);
474 /*
475 * If we're using the main input thread, synchronize the
476 * stats now since we have the interface context. All
477 * other cases involving dedicated input threads will
478 * have their stats synchronized there.
479 */
480 if (inp == dlil_main_input_thread) {
481 notify = dlil_input_stats_sync(ifp, inp);
482 }
483
484 dlil_input_wakeup(inp);
485 lck_mtx_unlock(&inp->dlth_lock);
486
487 /*
488 * Actual freeing of the excess packets must happen
489 * after the dlth_lock had been released.
490 */
491 if (!MBUFQ_EMPTY(&freeq)) {
492 m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
493 }
494
495 if (notify) {
496 ifnet_notify_data_threshold(ifp);
497 }
498
499 return 0;
500 }
501
/*
 * Synchronous input strategy: enqueue the chain on the thread's queue,
 * trim any burst, then immediately drain the queue and deliver the
 * packets up the stack in the caller's context (no input thread is
 * involved).  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* trim any burst beyond the configured limits; trimmed mbufs are
	 * collected on `freeq' and dropped after the lock is released */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* verify `s' accurately describes the chain (debug builds) */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#else
	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued (may include packets from earlier calls) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
614
/*
 * Hardware checksum debugging for inbound IPv4/IPv6 packets.
 *
 * Depending on `hwcksum_dbg_mode', this can (a) force partial checksum
 * offload by computing the 16-bit one's complement sum in software from
 * a configured offset, (b) verify a driver-supplied partial checksum
 * against a software recomputation, and (c) emulate hardware that
 * starts its sum at a different receive offset by adjusting the stored
 * checksum value.  `frame_header' must point within the mbuf's data
 * area (between datastart and the current data pointer).
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* sanity-check the frame header pointer before deriving hlen */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* link-layer header length: data pointer minus frame header start */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	/* only IPv4/IPv6 packets are of interest here */
	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* rx_start is relative to the frame header, hence + hlen */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* make rxoff relative to the start of the payload */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			/* nothing to do if the adjusted offset matches, or
			 * it lies beyond the packet */
			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
739
#if (DEVELOPMENT || DEBUG)
/*
 * Testing aid: when IFXF_MARK_WAKE_PKT has been set on the interface,
 * tag the next eligible unicast IPv4/IPv6 packet with PKTF_WAKE_PKT and
 * clear the request (one-shot).
 */
static void
dlil_input_process_wake_packet(ifnet_t ifp, protocol_family_t protocol_family, mbuf_ref_t m)
{
	if (check_wake_mbuf(ifp, protocol_family, m) == false) {
		return;
	}
	if (__probable((ifp->if_xflags & IFXF_MARK_WAKE_PKT) == 0)) {
		return;
	}
	if (protocol_family != PF_INET && protocol_family != PF_INET6) {
		return;
	}
	/*
	 * For testing we do not care about broadcast and multicast packets
	 * as they are not as controllable as unicast traffic.
	 */
	if ((m->m_flags & (M_BCAST | M_MCAST)) != 0) {
		return;
	}
	/* This is a one-shot command: clear the request, then tag the packet */
	ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
	m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
}
#endif /* (DEVELOPMENT || DEBUG) */
764
765 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,mbuf_ref_t m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)766 dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m,
767 u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
768 {
769 int error = 0;
770 protocol_family_t protocol_family;
771 mbuf_t next_packet;
772 ifnet_t ifp = ifp_param;
773 char *__single frame_header = NULL;
774 if_proto_ref_t last_ifproto = NULL;
775 mbuf_t pkt_first = NULL;
776 mbuf_t *pkt_next = NULL;
777 u_int32_t poll_thresh = 0, poll_ival = 0;
778 int iorefcnt = 0;
779 boolean_t skip_bridge_filter = FALSE;
780
781 KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
782
783 if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
784 (poll_ival = if_rxpoll_interval_pkts) > 0) {
785 poll_thresh = cnt;
786 }
787 if (bridge_enable_early_input != 0 &&
788 ifp != NULL && ifp->if_bridge != NULL) {
789 m = handle_bridge_early_input(ifp, m, cnt);
790 skip_bridge_filter = TRUE;
791 }
792 while (m != NULL) {
793 if_proto_ref_t ifproto = NULL;
794 uint32_t pktf_mask; /* pkt flags to preserve */
795
796 m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
797 m_add_hdr_crumb_interface_input(m, ifp->if_index, false);
798
799 if (ifp_param == NULL) {
800 ifp = m->m_pkthdr.rcvif;
801 }
802
803 if ((ifp->if_eflags & IFEF_RXPOLL) &&
804 (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
805 poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
806 ifnet_poll(ifp);
807 }
808
809 /* Check if this mbuf looks valid */
810 MBUF_INPUT_CHECK(m, ifp);
811
812 next_packet = m->m_nextpkt;
813 m->m_nextpkt = NULL;
814 frame_header = m->m_pkthdr.pkt_hdr;
815 m->m_pkthdr.pkt_hdr = NULL;
816
817 /*
818 * Get an IO reference count if the interface is not
819 * loopback (lo0) and it is attached; lo0 never goes
820 * away, so optimize for that.
821 */
822 if (ifp != lo_ifp) {
823 /* iorefcnt is 0 if it hasn't been taken yet */
824 if (iorefcnt == 0) {
825 if (!ifnet_datamov_begin(ifp)) {
826 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_DATAMOV_BEGIN, NULL, 0);
827 goto next;
828 }
829 }
830 iorefcnt = 1;
831 /*
832 * Preserve the time stamp and skip pktap flags.
833 */
834 pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
835 } else {
836 /*
837 * If this arrived on lo0, preserve interface addr
838 * info to allow for connectivity between loopback
839 * and local interface addresses.
840 */
841 pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
842 }
843 pktf_mask |= PKTF_WAKE_PKT;
844
845 /* make sure packet comes in clean */
846 m_classifier_init(m, pktf_mask);
847
848 ifp_inc_traffic_class_in(ifp, m);
849
850 /* find which protocol family this packet is for */
851 ifnet_lock_shared(ifp);
852 error = (*ifp->if_demux)(ifp, m, frame_header,
853 &protocol_family);
854 ifnet_lock_done(ifp);
855 if (error != 0) {
856 if (error == EJUSTRETURN) {
857 goto next;
858 }
859 protocol_family = 0;
860 }
861 /* check for an updated frame header */
862 if (m->m_pkthdr.pkt_hdr != NULL) {
863 frame_header = m->m_pkthdr.pkt_hdr;
864 m->m_pkthdr.pkt_hdr = NULL;
865 }
866
867 #if (DEVELOPMENT || DEBUG)
868 /* For testing only */
869 dlil_input_process_wake_packet(ifp, protocol_family, m);
870 #endif /* (DEVELOPMENT || DEBUG) */
871
872 pktap_input(ifp, protocol_family, m, frame_header);
873
874 /* Drop v4 packets received on CLAT46 enabled cell interface */
875 if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
876 ifp->if_type == IFT_CELLULAR) {
877 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
878 ip6stat.ip6s_clat464_in_v4_drop++;
879 goto next;
880 }
881
882 /* Translate the packet if it is received on CLAT interface */
883 if ((m->m_flags & M_PROMISC) == 0 &&
884 protocol_family == PF_INET6 &&
885 IS_INTF_CLAT46(ifp) &&
886 dlil_is_clat_needed(protocol_family, m)) {
887 char *data = NULL;
888 struct ether_header eh;
889 struct ether_header *ehp = NULL;
890
891 if (ifp->if_type == IFT_ETHER) {
892 ehp = (struct ether_header *)(void *)frame_header;
893 /* Skip RX Ethernet packets if they are not IPV6 */
894 if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
895 goto skip_clat;
896 }
897
898 /* Keep a copy of frame_header for Ethernet packets */
899 char *fh = __unsafe_forge_bidi_indexable(char *, m->m_pkthdr.pkt_hdr, ifnet_hdrlen(ifp));
900 if (fh) {
901 bcopy(fh, (caddr_t)&eh, ETHER_HDR_LEN);
902 }
903 }
904 error = dlil_clat64(ifp, &protocol_family, &m);
905 data = mtod(m, char*);
906 if (error != 0) {
907 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
908 ip6stat.ip6s_clat464_in_drop++;
909 goto next;
910 }
911 /* Native v6 should be No-op */
912 if (protocol_family != PF_INET) {
913 goto skip_clat;
914 }
915
916 /* Do this only for translated v4 packets. */
917 switch (ifp->if_type) {
918 case IFT_CELLULAR:
919 frame_header = data;
920 break;
921 case IFT_ETHER:
922 /*
923 * Drop if the mbuf doesn't have enough
924 * space for Ethernet header
925 */
926 if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
927 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
928 ip6stat.ip6s_clat464_in_drop++;
929 goto next;
930 }
931 /*
932 * Set the frame_header ETHER_HDR_LEN bytes
933 * preceeding the data pointer. Change
934 * the ether_type too.
935 * N.B. The variable `fh' is needed because
936 * the `frame_header' variable is `__single',
937 * and hence would not be appropriate for use with `bcopy'.
938 */
939 char *fh = data - ETHER_HDR_LEN;
940 frame_header = fh;
941 eh.ether_type = htons(ETHERTYPE_IP);
942 bcopy((caddr_t)&eh, fh, ETHER_HDR_LEN);
943 break;
944 }
945 }
946 skip_clat:
947 /*
948 * Match the wake packet against the list of ports that has been
949 * been queried by the driver before the device went to sleep
950 */
951 if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
952 if (protocol_family != PF_INET && protocol_family != PF_INET6) {
953 if_ports_used_match_mbuf(ifp, protocol_family, m);
954 }
955 }
956 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
957 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
958 dlil_input_cksum_dbg(ifp, m, frame_header,
959 protocol_family);
960 }
961 /*
962 * For partial checksum offload, we expect the driver to
963 * set the start offset indicating the start of the span
964 * that is covered by the hardware-computed checksum;
965 * adjust this start offset accordingly because the data
966 * pointer has been advanced beyond the link-layer header.
967 *
968 * Virtual lan types (bridge, vlan, bond) can call
969 * dlil_input_packet_list() with the same packet with the
970 * checksum flags set. Set a flag indicating that the
971 * adjustment has already been done.
972 */
973 if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
974 /* adjustment has already been done */
975 } else if ((m->m_pkthdr.csum_flags &
976 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
977 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
978 int adj;
979 if (frame_header == NULL ||
980 frame_header < (char *)mbuf_datastart(m) ||
981 frame_header > (char *)m->m_data ||
982 (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
983 m->m_pkthdr.csum_rx_start) {
984 m->m_pkthdr.csum_data = 0;
985 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
986 hwcksum_in_invalidated++;
987 } else {
988 m->m_pkthdr.csum_rx_start -= adj;
989 }
990 /* make sure we don't adjust more than once */
991 m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
992 }
993 if (clat_debug) {
994 pktap_input(ifp, protocol_family, m, frame_header);
995 }
996
997 if (m->m_flags & (M_BCAST | M_MCAST)) {
998 os_atomic_inc(&ifp->if_imcasts, relaxed);
999 }
1000
1001 /* run interface filters */
1002 error = dlil_interface_filters_input(ifp, &m,
1003 &frame_header, protocol_family, skip_bridge_filter);
1004 if (error != 0) {
1005 if (error != EJUSTRETURN) {
1006 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
1007 }
1008 goto next;
1009 }
1010 /*
1011 * A VLAN and Bond interface receives packets by attaching
1012 * a "protocol" to the underlying interface.
1013 * A promiscuous packet needs to be delivered to the
1014 * VLAN or Bond interface since:
1015 * - Bond interface member may not support setting the
1016 * MAC address, so packets are inherently "promiscuous"
1017 * - A VLAN or Bond interface could be members of a bridge,
1018 * where promiscuous packets correspond to other
1019 * devices that the bridge forwards packets to/from
1020 */
1021 if ((m->m_flags & M_PROMISC) != 0) {
1022 switch (protocol_family) {
1023 case PF_VLAN:
1024 case PF_BOND:
1025 /* VLAN and Bond get promiscuous packets */
1026 break;
1027 default:
1028 if (droptap_verbose > 0) {
1029 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_PROMISC, NULL, 0);
1030 } else {
1031 m_freem(m);
1032 }
1033 goto next;
1034 }
1035 }
1036
1037 /* Lookup the protocol attachment to this interface */
1038 if (protocol_family == 0) {
1039 ifproto = NULL;
1040 } else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
1041 (last_ifproto->protocol_family == protocol_family)) {
1042 VERIFY(ifproto == NULL);
1043 ifproto = last_ifproto;
1044 if_proto_ref(last_ifproto);
1045 } else {
1046 VERIFY(ifproto == NULL);
1047 ifnet_lock_shared(ifp);
1048 /* callee holds a proto refcnt upon success */
1049 ifproto = find_attached_proto(ifp, protocol_family);
1050 ifnet_lock_done(ifp);
1051 }
1052 if (ifproto == NULL) {
1053 /* no protocol for this packet, discard */
1054 m_drop_extended(m, ifp, frame_header, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0);
1055 goto next;
1056 }
1057 if (ifproto != last_ifproto) {
1058 if (last_ifproto != NULL) {
1059 /* pass up the list for the previous protocol */
1060 dlil_ifproto_input(last_ifproto, pkt_first);
1061 pkt_first = NULL;
1062 if_proto_free(last_ifproto);
1063 }
1064 last_ifproto = ifproto;
1065 if_proto_ref(ifproto);
1066 }
1067 /* extend the list */
1068 m->m_pkthdr.pkt_hdr = frame_header;
1069 if (pkt_first == NULL) {
1070 pkt_first = m;
1071 } else {
1072 *pkt_next = m;
1073 }
1074 pkt_next = &m->m_nextpkt;
1075
1076 next:
1077 if (next_packet == NULL && last_ifproto != NULL) {
1078 /* pass up the last list of packets */
1079 dlil_ifproto_input(last_ifproto, pkt_first);
1080 if_proto_free(last_ifproto);
1081 last_ifproto = NULL;
1082 }
1083 if (ifproto != NULL) {
1084 if_proto_free(ifproto);
1085 ifproto = NULL;
1086 }
1087
1088 m = next_packet;
1089
1090 /* update the driver's multicast filter, if needed */
1091 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
1092 ifp->if_updatemcasts = 0;
1093 }
1094 if (iorefcnt == 1) {
1095 /* If the next mbuf is on a different interface, unlock data-mov */
1096 if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
1097 ifnet_datamov_end(ifp);
1098 iorefcnt = 0;
1099 }
1100 }
1101 }
1102
1103 KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
1104 }
1105
1106 /*
1107 * Input thread for interfaces with legacy input model.
1108 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* Dedicated (per-interface) input threads always have an ifp. */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/*
	 * A legacy interface with RXPOLL enabled is serviced by the
	 * rxpoll input thread instead; assert we were not misrouted.
	 */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

#if CONFIG_THREAD_GROUPS
	/* Account this thread's work to the cellular thread group. */
	if (IFNET_REQUIRES_CELL_GROUP(ifp)) {
		thread_group_join_cellular();
	}
#endif /* CONFIG_THREAD_GROUPS */

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/*
	 * Arm the wait on dlth_flags before posting the self-wakeup, so
	 * the continuation below runs at least once and observes (and
	 * clears) the embryonic state.
	 */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	/* Park; all further work happens in dlil_input_thread_cont. */
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1150
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/*
	 * Go straight to teardown if the wait was interrupted or a
	 * terminate request was posted while we were blocked.
	 */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/*
	 * Drain and process batches until no new work has been posted
	 * on dlth_flags, then park again at the bottom of the function.
	 */
	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* First pass after creation clears the embryonic state. */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* No new work posted while we were processing; go park. */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* Park until more packets arrive for this thread. */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1254
1255 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)1256 dlil_input_wakeup(struct dlil_threading_info *inp)
1257 {
1258 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
1259
1260 inp->dlth_flags |= DLIL_INPUT_WAITING;
1261 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
1262 inp->dlth_wtot++;
1263 wakeup_one((caddr_t)&inp->dlth_flags);
1264 }
1265 }
1266
1267 static int
dlil_interface_filters_input(struct ifnet * ifp,mbuf_ref_ref_t m_p,char ** frame_header_p,protocol_family_t protocol_family,boolean_t skip_bridge)1268 dlil_interface_filters_input(struct ifnet *ifp, mbuf_ref_ref_t m_p,
1269 char **frame_header_p, protocol_family_t protocol_family,
1270 boolean_t skip_bridge)
1271 {
1272 boolean_t is_vlan_packet = FALSE;
1273 struct ifnet_filter *filter;
1274 struct mbuf *m = *m_p;
1275
1276 is_vlan_packet = packet_has_vlan_tag(m);
1277
1278 if (TAILQ_EMPTY(&ifp->if_flt_head)) {
1279 return 0;
1280 }
1281
1282 /*
1283 * Pass the inbound packet to the interface filters
1284 */
1285 lck_mtx_lock_spin(&ifp->if_flt_lock);
1286 /* prevent filter list from changing in case we drop the lock */
1287 if_flt_monitor_busy(ifp);
1288 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
1289 int result;
1290
1291 /* exclude VLAN packets from external filters PR-3586856 */
1292 if (is_vlan_packet &&
1293 (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
1294 continue;
1295 }
1296 /* the bridge has already seen the packet */
1297 if (skip_bridge &&
1298 (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
1299 continue;
1300 }
1301 if (!filter->filt_skip && filter->filt_input != NULL &&
1302 (filter->filt_protocol == 0 ||
1303 filter->filt_protocol == protocol_family)) {
1304 lck_mtx_unlock(&ifp->if_flt_lock);
1305
1306 result = (*filter->filt_input)(filter->filt_cookie,
1307 ifp, protocol_family, m_p, frame_header_p);
1308
1309 lck_mtx_lock_spin(&ifp->if_flt_lock);
1310 if (result != 0) {
1311 /* we're done with the filter list */
1312 if_flt_monitor_unbusy(ifp);
1313 lck_mtx_unlock(&ifp->if_flt_lock);
1314 return result;
1315 }
1316 }
1317 }
1318 /* we're done with the filter list */
1319 if_flt_monitor_unbusy(ifp);
1320 lck_mtx_unlock(&ifp->if_flt_lock);
1321
1322 /*
1323 * Strip away M_PROTO1 bit prior to sending packet up the stack as
1324 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
1325 */
1326 if (*m_p != NULL) {
1327 (*m_p)->m_flags &= ~M_PROTO1;
1328 }
1329
1330 return 0;
1331 }
1332
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	dlil_threading_info_ref_t inp = v;

	/* This entry point is exclusive to the singleton main input
	 * thread, which is not bound to any particular interface. */
	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/*
	 * Arm the wait before posting the self-wakeup so the continuation
	 * runs at least once and clears the embryonic state.
	 */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	/* Park; all further work happens in dlil_main_input_thread_cont. */
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1355
1356 /*
1357 * Main input thread:
1358 *
1359 * a) handles all inbound packets for lo0
1360 * b) handles all inbound packets for interfaces with no dedicated
1361 * input thread (e.g. anything but Ethernet/PDP or those that support
1362 * opportunistic polling.)
1363 * c) protocol registrations
1364 * d) packet injections
1365 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* `v' is the main threading info; view it both ways. */
	dlil_main_threading_info_ref_t inpm = v;
	dlil_threading_info_ref_t inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/*
	 * Drain both queues (non-dedicated interfaces and lo0) and run
	 * any pending protocol registrations until no new work remains,
	 * then park again at the bottom of the function.
	 */
	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* First pass after creation clears the embryonic state. */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* No new work posted while we were processing; go park. */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	/* Park until more work is posted to the main input thread. */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1452
1453 /*
1454 * Input thread for interfaces with opportunistic polling input model.
1455 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* Only legacy interfaces with RXPOLL enabled use this thread. */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/*
	 * Arm the wait before posting the self-wakeup so the continuation
	 * runs at least once and clears the embryonic state.
	 */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	/* Park; all further work happens in dlil_rxpoll_input_thread_cont. */
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1488
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/*
	 * Go straight to teardown if the wait was interrupted or a
	 * terminate request was posted while we were blocked.
	 */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/*
	 * Each pass: sample inbound traffic statistics, possibly switch
	 * the interface between interrupt and polling mode, then process
	 * the dequeued packet chain.  Loop until no new work is posted.
	 */
	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/*
		 * First pass after creation: clear the embryonic state and
		 * skip the sampling logic (no traffic seen yet).
		 */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* Clamp the poll interval to the supported minimum. */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* Wait out the hold time before acting on a sample. */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				/* rate-limit the debug output to dlil_dbgrate */
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Hysteresis: turn polling off when both packet and
			 * byte averages drop below the low watermarks; turn
			 * it on when the packet average plus either the byte
			 * or wakeup average exceed the high watermarks.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode. Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_get_ioref(ifp)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* No new work posted while we were processing; go park. */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* Park until more packets arrive for this thread. */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1774
/*
 * Trim an overcommitted input queue down to `if_rcvq_trim_pct' percent
 * of its configured limit.  The dropped (oldest) packets are moved onto
 * `freeq' so the caller can free them outside the queue lock, and
 * `stat_delta' is adjusted so the trimmed packets/bytes are accounted
 * as drops rather than as received traffic.
 *
 * Returns the number of packets dropped.  Caller must hold the lock
 * protecting `input_queue' (hence the _locked suffix).
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;            /* Length in packets. */
	uint64_t overcommitted_qsize;           /* Size in bytes. */
	uint32_t target_qlen;                   /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;              /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;              /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;             /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		/* defer freeing to the caller, outside the queue lock */
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
1871
1872 static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp,mbuf_t m,u_int32_t cnt)1873 handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
1874 {
1875 lck_mtx_lock_spin(&ifp->if_flt_lock);
1876 if_flt_monitor_busy(ifp);
1877 lck_mtx_unlock(&ifp->if_flt_lock);
1878
1879 if (ifp->if_bridge != NULL) {
1880 m = bridge_early_input(ifp, m, cnt);
1881 }
1882 lck_mtx_lock_spin(&ifp->if_flt_lock);
1883 if_flt_monitor_unbusy(ifp);
1884 lck_mtx_unlock(&ifp->if_flt_lock);
1885 return m;
1886 }
1887