1 /*
2 * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <net/if_var.h>
30 #include <net/dlil_var_private.h>
31 #include <net/dlil.h>
32 #include <net/dlil_sysctl.h>
33
34
/*
 * Exponentially-weighted moving average:
 *   old = ((old * (2^decay - 1)) + new) / 2^decay
 * When `old' is still 0 the average is seeded directly with `new'.
 * NOTE: arguments may be evaluated more than once; avoid side effects.
 */
#define DLIL_EWMA(old, new, decay) do {					\
	u_int32_t _avg;							\
	if ((_avg = (old)) > 0)						\
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay);	\
	else								\
		_avg = (new);						\
	(old) = _avg;							\
} while (0)
43
44
/*
 * Detect whether a queue contains a burst that needs to be trimmed:
 * true when the queue holds mbuf packets (QP_MBUF) and its length
 * exceeds both `if_rcvq_burst_limit' and the queue's own limit.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)					\
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&	\
	qtype(q) == QP_MBUF)
51
52
/* rate limit debug messages: at most one per 1-second interval */
struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
55
56 extern void proto_input_run(void);
57
58 static errno_t dlil_input_async(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
59 static errno_t dlil_input_sync(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
60 static void dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, protocol_family_t pf);
61 static void dlil_input_packet_list_common(struct ifnet *, mbuf_ref_t, u_int32_t, ifnet_model_t, boolean_t);
62 static void dlil_input_thread_func(void *, wait_result_t);
63 static void dlil_input_thread_cont(void *, wait_result_t);
64 static inline void dlil_input_wakeup(struct dlil_threading_info *inp);
65
66 static int dlil_interface_filters_input(struct ifnet *, mbuf_ref_ref_t, char **, protocol_family_t, boolean_t);
67
68 static void dlil_main_input_thread_func(void *, wait_result_t);
69 static void dlil_main_input_thread_cont(void *, wait_result_t);
70
71 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
72 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
73
74 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue, dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta);
75
76 static inline mbuf_t handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt);
77 /*
78 * Publicly visible functions.
79 */
80
/*
 * Set up the DLIL input thread state for an interface and, when a thread
 * continuation applies, start the kernel thread that will drain its
 * receive queue.
 *
 * Four strategies are selected here:
 *  - main input thread (ifp == NULL), created once at dlil_init time;
 *  - legacy hybrid (rx)polling thread, when the interface is RXPOLL
 *    capable, legacy, and `net_rxpoll' is enabled;
 *  - dedicated asynchronous input thread;
 *  - synchronous strategy with no dedicated thread, in which case
 *    ENODEV is returned after the queue is initialized.
 *
 * On return, `*thfunc' (if non-NULL) holds the selected thread
 * continuation, or NULL for the synchronous strategy.
 */
int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy hybrid polling requires RXPOLL + LEGACY + net_rxpoll */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "main_input");
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 * `func' stays NULL, so no thread is started below and
		 * we return ENODEV once the queue is set up.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
		inp->dlth_name = __unsafe_null_terminated_from_indexable(inp->dlth_name_storage);
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	/* drop-tail receive queue; mbuf packets only */
	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* the main input thread also owns a dedicated loopback queue */
		dlil_main_threading_info_ref_t inpm =
		    __container_of(inp, struct dlil_main_threading_info, inp);
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: there is no input thread to create */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* run the input thread at default (zero) importance */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp __single = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				/* reference dropped when affinity is torn down */
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
223
/*
 * Tear down a dedicated (non-main) input thread; runs on that thread
 * itself. Drains the pending packet queue, publishes
 * DLIL_INPUT_TERMINATE_COMPLETE to the waiter, frees the drained
 * packets after dropping the lock, releases the reference taken by
 * kernel_thread_start(), and terminates the current thread.
 * Does not return.
 */
void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	ifnet_ref_t ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/* drain the queue and signal termination completion under the lock */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
271
272 boolean_t
dlil_is_rxpoll_input(thread_continue_t func)273 dlil_is_rxpoll_input(thread_continue_t func)
274 {
275 return func == dlil_rxpoll_input_thread_func;
276 }
277
/*
 * Entry point for inbound packet chains handed up by drivers.
 * Dispatches the chain [m_head, m_tail] (described by `s') to the
 * interface's input strategy (async or sync); interfaces without a
 * dedicated input thread fall back to the main input thread.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	dlil_threading_info_ref_t inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* threads marked for synchronous RX testing bypass the strategy */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
298
299 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)300 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
301 {
302 return dlil_input_packet_list_common(ifp, m, 0,
303 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
304 }
305
306 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)307 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
308 u_int32_t cnt, ifnet_model_t mode)
309 {
310 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
311 }
312
313 /*
314 * Static function implementations.
315 */
316 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_ref_t m)317 dlil_ifproto_input(struct if_proto * ifproto, mbuf_ref_t m)
318 {
319 int error;
320
321 if (ifproto->proto_kpi == kProtoKPI_v1) {
322 /* Version 1 protocols get one packet at a time */
323 while (m != NULL) {
324 /*
325 * Version 1 KPI does not accept header len,
326 * hence the pointer to the frame header must be `__single'.
327 */
328 char *frame_header_ptr __single;
329
330 mbuf_t next_packet;
331
332 next_packet = m->m_nextpkt;
333 m->m_nextpkt = NULL;
334 frame_header_ptr = m->m_pkthdr.pkt_hdr;
335
336 m->m_pkthdr.pkt_hdr = NULL;
337 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
338 ifproto->protocol_family, m, frame_header_ptr);
339 if (error != 0 && error != EJUSTRETURN) {
340 m_drop_if(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
341 }
342 m = next_packet;
343 }
344 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
345 /* Version 2 protocols support packet lists */
346 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
347 ifproto->protocol_family, m);
348 if (error != 0 && error != EJUSTRETURN) {
349 m_drop_list(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
350 }
351 }
352 }
353
354 static errno_t
dlil_input_async(struct dlil_threading_info * inp,struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)355 dlil_input_async(struct dlil_threading_info *inp,
356 struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
357 const struct ifnet_stat_increment_param *s, boolean_t poll,
358 struct thread *tp)
359 {
360 u_int32_t m_cnt = s->packets_in;
361 u_int32_t m_size = s->bytes_in;
362 boolean_t notify = FALSE;
363 struct ifnet_stat_increment_param s_adj = *s;
364 dlil_freeq_t freeq;
365 MBUFQ_INIT(&freeq);
366
367 /*
368 * If there is a matching DLIL input thread associated with an
369 * affinity set, associate this thread with the same set. We
370 * will only do this once.
371 */
372 lck_mtx_lock_spin(&inp->dlth_lock);
373 if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
374 ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
375 (poll && inp->dlth_poller_thread == THREAD_NULL))) {
376 u_int32_t tag = inp->dlth_affinity_tag;
377
378 if (poll) {
379 VERIFY(inp->dlth_poller_thread == THREAD_NULL);
380 inp->dlth_poller_thread = tp;
381 } else {
382 VERIFY(inp->dlth_driver_thread == THREAD_NULL);
383 inp->dlth_driver_thread = tp;
384 }
385 lck_mtx_unlock(&inp->dlth_lock);
386
387 /* Associate the current thread with the new affinity tag */
388 (void) dlil_affinity_set(tp, tag);
389
390 /*
391 * Take a reference on the current thread; during detach,
392 * we will need to refer to it in order to tear down its
393 * affinity.
394 */
395 thread_reference(tp);
396 lck_mtx_lock_spin(&inp->dlth_lock);
397 }
398
399 VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
400
401 /*
402 * Because of loopbacked multicast we cannot stuff the ifp in
403 * the rcvif of the packet header: loopback (lo0) packets use a
404 * dedicated list so that we can later associate them with lo_ifp
405 * on their way up the stack. Packets for other interfaces without
406 * dedicated input threads go to the regular list.
407 */
408 if (m_head != NULL) {
409 classq_pkt_t head, tail;
410 class_queue_t *input_queue;
411 CLASSQ_PKT_INIT_MBUF(&head, m_head);
412 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
413 if (inp == dlil_main_input_thread && ifp == lo_ifp) {
414 dlil_main_threading_info_ref_t inpm =
415 __container_of(inp, struct dlil_main_threading_info, inp);
416 input_queue = &inpm->lo_rcvq_pkts;
417 } else {
418 input_queue = &inp->dlth_pkts;
419 }
420
421 _addq_multi(input_queue, &head, &tail, m_cnt, m_size);
422
423 if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
424 dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
425 inp->dlth_trim_pkts_dropped += s_adj.dropped;
426 inp->dlth_trim_cnt += 1;
427
428 os_log_error(OS_LOG_DEFAULT,
429 "%s %s burst limit %u (sysctl: %u) exceeded. "
430 "%u packets dropped [%u total in %u events]. new qlen %u ",
431 __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
432 s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
433 qlen(input_queue));
434 }
435 }
436
437 #if IFNET_INPUT_SANITY_CHK
438 /*
439 * Verify that the original stat increment parameter
440 * accurately describes the input chain `m_head`.
441 * This is not affected by the trimming of input queue.
442 */
443 if (__improbable(dlil_input_sanity_check != 0)) {
444 u_int32_t count = 0, size = 0;
445 struct mbuf *m0;
446
447 for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
448 m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
449 size += m_length(m0);
450 count++;
451 }
452
453 if (count != m_cnt) {
454 panic_plain("%s: invalid total packet count %u "
455 "(expected %u)\n", if_name(ifp), count, m_cnt);
456 /* NOTREACHED */
457 __builtin_unreachable();
458 } else if (size != m_size) {
459 panic_plain("%s: invalid total packet size %u "
460 "(expected %u)\n", if_name(ifp), size, m_size);
461 /* NOTREACHED */
462 __builtin_unreachable();
463 }
464
465 inp->dlth_pkts_cnt += m_cnt;
466 }
467 #else
468 m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
469 #endif /* IFNET_INPUT_SANITY_CHK */
470
471 /* NOTE: use the adjusted parameter, vs the original one */
472 dlil_input_stats_add(&s_adj, inp, ifp, poll);
473 /*
474 * If we're using the main input thread, synchronize the
475 * stats now since we have the interface context. All
476 * other cases involving dedicated input threads will
477 * have their stats synchronized there.
478 */
479 if (inp == dlil_main_input_thread) {
480 notify = dlil_input_stats_sync(ifp, inp);
481 }
482
483 dlil_input_wakeup(inp);
484 lck_mtx_unlock(&inp->dlth_lock);
485
486 /*
487 * Actual freeing of the excess packets must happen
488 * after the dlth_lock had been released.
489 */
490 if (!MBUFQ_EMPTY(&freeq)) {
491 m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
492 }
493
494 if (notify) {
495 ifnet_notify_data_threshold(ifp);
496 }
497
498 return 0;
499 }
500
/*
 * Synchronous input strategy: enqueue the chain on the thread's queue,
 * then immediately drain the whole queue and process it inline on the
 * caller's thread via dlil_input_packet_list_extended(). Used when a
 * netif sits below and no dedicated input thread exists. Returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/*
	 * Trim bursts that exceed the queue limit; the excess goes on
	 * `freeq' (freed after the lock is dropped) and `s_adj' is
	 * adjusted to describe what was kept.
	 */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter accurately
	 * describes the input chain `m_head' (unaffected by trimming).
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#else
	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued (possibly more than we just added) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
613
/*
 * Hardware checksum debugging for inbound packets (controlled by the
 * `hwcksum_dbg*' knobs). Depending on the mode it can:
 *  - force partial checksum offload by computing the 16-bit 1's
 *    complement sum in software from a configured offset;
 *  - verify a driver-supplied partial checksum against a software
 *    recomputation; and/or
 *  - re-base the partial checksum at a different start offset to
 *    emulate various hardware behaviors.
 * Only PF_INET/PF_INET6 packets are considered; `frame_header' must
 * point within the mbuf's data area.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* sanity check the frame header pointer before deriving offsets */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* link-layer header length: distance from frame header to data */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* only IP traffic participates in checksum debugging */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* rx_start is relative to the frame header, hence + hlen */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* make rxoff relative to the current data pointer */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* re-base the checksum at the adjusted offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
738
739 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,mbuf_ref_t m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)740 dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m,
741 u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
742 {
743 int error = 0;
744 protocol_family_t protocol_family;
745 mbuf_t next_packet;
746 ifnet_t ifp = ifp_param;
747 char *__single frame_header = NULL;
748 if_proto_ref_t last_ifproto = NULL;
749 mbuf_t pkt_first = NULL;
750 mbuf_t *pkt_next = NULL;
751 u_int32_t poll_thresh = 0, poll_ival = 0;
752 int iorefcnt = 0;
753 boolean_t skip_bridge_filter = FALSE;
754
755 KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
756
757 if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
758 (poll_ival = if_rxpoll_interval_pkts) > 0) {
759 poll_thresh = cnt;
760 }
761 if (bridge_enable_early_input != 0 &&
762 ifp != NULL && ifp->if_bridge != NULL) {
763 m = handle_bridge_early_input(ifp, m, cnt);
764 skip_bridge_filter = TRUE;
765 }
766 while (m != NULL) {
767 if_proto_ref_t ifproto = NULL;
768 uint32_t pktf_mask; /* pkt flags to preserve */
769
770 m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
771 m_add_hdr_crumb_interface_input(m, ifp->if_index, false);
772
773 if (ifp_param == NULL) {
774 ifp = m->m_pkthdr.rcvif;
775 }
776
777 if ((ifp->if_eflags & IFEF_RXPOLL) &&
778 (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
779 poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
780 ifnet_poll(ifp);
781 }
782
783 /* Check if this mbuf looks valid */
784 MBUF_INPUT_CHECK(m, ifp);
785
786 next_packet = m->m_nextpkt;
787 m->m_nextpkt = NULL;
788 frame_header = m->m_pkthdr.pkt_hdr;
789 m->m_pkthdr.pkt_hdr = NULL;
790
791 /*
792 * Get an IO reference count if the interface is not
793 * loopback (lo0) and it is attached; lo0 never goes
794 * away, so optimize for that.
795 */
796 if (ifp != lo_ifp) {
797 /* iorefcnt is 0 if it hasn't been taken yet */
798 if (iorefcnt == 0) {
799 if (!ifnet_datamov_begin(ifp)) {
800 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_DATAMOV_BEGIN, NULL, 0);
801 goto next;
802 }
803 }
804 iorefcnt = 1;
805 /*
806 * Preserve the time stamp and skip pktap flags.
807 */
808 pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
809 } else {
810 /*
811 * If this arrived on lo0, preserve interface addr
812 * info to allow for connectivity between loopback
813 * and local interface addresses.
814 */
815 pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
816 }
817 pktf_mask |= PKTF_WAKE_PKT;
818
819 /* make sure packet comes in clean */
820 m_classifier_init(m, pktf_mask);
821
822 ifp_inc_traffic_class_in(ifp, m);
823
824 /* find which protocol family this packet is for */
825 ifnet_lock_shared(ifp);
826 error = (*ifp->if_demux)(ifp, m, frame_header,
827 &protocol_family);
828 ifnet_lock_done(ifp);
829 if (error != 0) {
830 if (error == EJUSTRETURN) {
831 goto next;
832 }
833 protocol_family = 0;
834 }
835 /* check for an updated frame header */
836 if (m->m_pkthdr.pkt_hdr != NULL) {
837 frame_header = m->m_pkthdr.pkt_hdr;
838 m->m_pkthdr.pkt_hdr = NULL;
839 }
840
841 #if (DEVELOPMENT || DEBUG)
842 /*
843 * For testing we do not care about broadcast and multicast packets as
844 * they are not as controllable as unicast traffic
845 */
846 if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
847 if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
848 (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
849 /*
850 * This is a one-shot command
851 */
852 ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
853 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
854 }
855 }
856 #endif /* (DEVELOPMENT || DEBUG) */
857 if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
858 char buffer[64];
859 size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
860
861 os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
862 ifp->if_xname, m_pktlen(m));
863 if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
864 log_hexdump(buffer, buflen);
865 }
866 }
867
868 pktap_input(ifp, protocol_family, m, frame_header);
869
870 /* Drop v4 packets received on CLAT46 enabled cell interface */
871 if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
872 ifp->if_type == IFT_CELLULAR) {
873 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
874 ip6stat.ip6s_clat464_in_v4_drop++;
875 goto next;
876 }
877
878 /* Translate the packet if it is received on CLAT interface */
879 if ((m->m_flags & M_PROMISC) == 0 &&
880 protocol_family == PF_INET6 &&
881 IS_INTF_CLAT46(ifp) &&
882 dlil_is_clat_needed(protocol_family, m)) {
883 char *data = NULL;
884 struct ether_header eh;
885 struct ether_header *ehp = NULL;
886
887 if (ifp->if_type == IFT_ETHER) {
888 ehp = (struct ether_header *)(void *)frame_header;
889 /* Skip RX Ethernet packets if they are not IPV6 */
890 if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
891 goto skip_clat;
892 }
893
894 /* Keep a copy of frame_header for Ethernet packets */
895 char *fh = __unsafe_forge_bidi_indexable(char *, m->m_pkthdr.pkt_hdr, ifnet_hdrlen(ifp));
896 if (fh) {
897 bcopy(fh, (caddr_t)&eh, ETHER_HDR_LEN);
898 }
899 }
900 error = dlil_clat64(ifp, &protocol_family, &m);
901 data = mtod(m, char*);
902 if (error != 0) {
903 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
904 ip6stat.ip6s_clat464_in_drop++;
905 goto next;
906 }
907 /* Native v6 should be No-op */
908 if (protocol_family != PF_INET) {
909 goto skip_clat;
910 }
911
912 /* Do this only for translated v4 packets. */
913 switch (ifp->if_type) {
914 case IFT_CELLULAR:
915 frame_header = data;
916 break;
917 case IFT_ETHER:
918 /*
919 * Drop if the mbuf doesn't have enough
920 * space for Ethernet header
921 */
922 if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
923 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
924 ip6stat.ip6s_clat464_in_drop++;
925 goto next;
926 }
927 /*
928 * Set the frame_header ETHER_HDR_LEN bytes
			 * preceding the data pointer. Change
930 * the ether_type too.
931 * N.B. The variable `fh' is needed because
932 * the `frame_header' variable is `__single',
933 * and hence would not be appropriate for use with `bcopy'.
934 */
935 char *fh = data - ETHER_HDR_LEN;
936 frame_header = fh;
937 eh.ether_type = htons(ETHERTYPE_IP);
938 bcopy((caddr_t)&eh, fh, ETHER_HDR_LEN);
939 break;
940 }
941 }
942 skip_clat:
943 /*
	 * Match the wake packet against the list of ports that has
	 * been queried by the driver before the device went to sleep
946 */
947 if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
948 if (protocol_family != PF_INET && protocol_family != PF_INET6) {
949 if_ports_used_match_mbuf(ifp, protocol_family, m);
950 }
951 }
952 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
953 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
954 dlil_input_cksum_dbg(ifp, m, frame_header,
955 protocol_family);
956 }
957 /*
958 * For partial checksum offload, we expect the driver to
959 * set the start offset indicating the start of the span
960 * that is covered by the hardware-computed checksum;
961 * adjust this start offset accordingly because the data
962 * pointer has been advanced beyond the link-layer header.
963 *
964 * Virtual lan types (bridge, vlan, bond) can call
965 * dlil_input_packet_list() with the same packet with the
966 * checksum flags set. Set a flag indicating that the
967 * adjustment has already been done.
968 */
969 if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
970 /* adjustment has already been done */
971 } else if ((m->m_pkthdr.csum_flags &
972 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
973 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
974 int adj;
975 if (frame_header == NULL ||
976 frame_header < (char *)mbuf_datastart(m) ||
977 frame_header > (char *)m->m_data ||
978 (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
979 m->m_pkthdr.csum_rx_start) {
980 m->m_pkthdr.csum_data = 0;
981 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
982 hwcksum_in_invalidated++;
983 } else {
984 m->m_pkthdr.csum_rx_start -= adj;
985 }
986 /* make sure we don't adjust more than once */
987 m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
988 }
989 if (clat_debug) {
990 pktap_input(ifp, protocol_family, m, frame_header);
991 }
992
993 if (m->m_flags & (M_BCAST | M_MCAST)) {
994 os_atomic_inc(&ifp->if_imcasts, relaxed);
995 }
996
997 /* run interface filters */
998 error = dlil_interface_filters_input(ifp, &m,
999 &frame_header, protocol_family, skip_bridge_filter);
1000 if (error != 0) {
1001 if (error != EJUSTRETURN) {
1002 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
1003 }
1004 goto next;
1005 }
1006 /*
1007 * A VLAN and Bond interface receives packets by attaching
1008 * a "protocol" to the underlying interface.
1009 * A promiscuous packet needs to be delivered to the
1010 * VLAN or Bond interface since:
1011 * - Bond interface member may not support setting the
1012 * MAC address, so packets are inherently "promiscuous"
1013 * - A VLAN or Bond interface could be members of a bridge,
1014 * where promiscuous packets correspond to other
1015 * devices that the bridge forwards packets to/from
1016 */
1017 if ((m->m_flags & M_PROMISC) != 0) {
1018 switch (protocol_family) {
1019 case PF_VLAN:
1020 case PF_BOND:
1021 /* VLAN and Bond get promiscuous packets */
1022 break;
1023 default:
1024 if (droptap_verbose > 0) {
1025 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_PROMISC, NULL, 0);
1026 } else {
1027 m_freem(m);
1028 }
1029 goto next;
1030 }
1031 }
1032
1033 /* Lookup the protocol attachment to this interface */
1034 if (protocol_family == 0) {
1035 ifproto = NULL;
1036 } else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
1037 (last_ifproto->protocol_family == protocol_family)) {
1038 VERIFY(ifproto == NULL);
1039 ifproto = last_ifproto;
1040 if_proto_ref(last_ifproto);
1041 } else {
1042 VERIFY(ifproto == NULL);
1043 ifnet_lock_shared(ifp);
1044 /* callee holds a proto refcnt upon success */
1045 ifproto = find_attached_proto(ifp, protocol_family);
1046 ifnet_lock_done(ifp);
1047 }
1048 if (ifproto == NULL) {
1049 /* no protocol for this packet, discard */
1050 m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0);
1051 goto next;
1052 }
1053 if (ifproto != last_ifproto) {
1054 if (last_ifproto != NULL) {
1055 /* pass up the list for the previous protocol */
1056 dlil_ifproto_input(last_ifproto, pkt_first);
1057 pkt_first = NULL;
1058 if_proto_free(last_ifproto);
1059 }
1060 last_ifproto = ifproto;
1061 if_proto_ref(ifproto);
1062 }
1063 /* extend the list */
1064 m->m_pkthdr.pkt_hdr = frame_header;
1065 if (pkt_first == NULL) {
1066 pkt_first = m;
1067 } else {
1068 *pkt_next = m;
1069 }
1070 pkt_next = &m->m_nextpkt;
1071
1072 next:
1073 if (next_packet == NULL && last_ifproto != NULL) {
1074 /* pass up the last list of packets */
1075 dlil_ifproto_input(last_ifproto, pkt_first);
1076 if_proto_free(last_ifproto);
1077 last_ifproto = NULL;
1078 }
1079 if (ifproto != NULL) {
1080 if_proto_free(ifproto);
1081 ifproto = NULL;
1082 }
1083
1084 m = next_packet;
1085
1086 /* update the driver's multicast filter, if needed */
1087 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
1088 ifp->if_updatemcasts = 0;
1089 }
1090 if (iorefcnt == 1) {
1091 /* If the next mbuf is on a different interface, unlock data-mov */
1092 if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
1093 ifnet_datamov_end(ifp);
1094 iorefcnt = 0;
1095 }
1096 }
1097 }
1098
1099 KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
1100 }
1101
1102 /*
1103 * Input thread for interfaces with legacy input model.
1104 */
/*
 * Entry point for a dedicated (legacy input model) per-interface input
 * thread.  Names the thread after its interface, transitions it into
 * the embryonic state, and parks it on dlil_input_thread_cont, which
 * performs all subsequent work.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* this entry point never serves the shared main input thread */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/*
	 * Legacy-model threads must not be used for RXPOLL-capable
	 * legacy interfaces; those use dlil_rxpoll_input_thread_func.
	 */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	/*
	 * Assert the wait before issuing the self-wakeup so the wakeup
	 * is consumed as soon as the thread blocks on the continuation.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1140
/*
 * Continuation for a dedicated (legacy model) input thread.  Invoked
 * with no stack context each time the thread unblocks; `v' carries the
 * per-thread dlil_threading_info.  Drains the thread's packet queue,
 * hands chains to dlil_input_packet_list_extended(), then re-blocks on
 * dlth_flags with this same continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* tear down if interrupted or if termination was requested */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		/* consume the pending wakeup request(s) */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* lock dropped: the work below must not touch dlth_flags */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no new wakeup arrived while the lock was dropped: go idle */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* park on this continuation until the next wakeup */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1244
1245 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)1246 dlil_input_wakeup(struct dlil_threading_info *inp)
1247 {
1248 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
1249
1250 inp->dlth_flags |= DLIL_INPUT_WAITING;
1251 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
1252 inp->dlth_wtot++;
1253 wakeup_one((caddr_t)&inp->dlth_flags);
1254 }
1255 }
1256
1257 static int
dlil_interface_filters_input(struct ifnet * ifp,mbuf_ref_ref_t m_p,char ** frame_header_p,protocol_family_t protocol_family,boolean_t skip_bridge)1258 dlil_interface_filters_input(struct ifnet *ifp, mbuf_ref_ref_t m_p,
1259 char **frame_header_p, protocol_family_t protocol_family,
1260 boolean_t skip_bridge)
1261 {
1262 boolean_t is_vlan_packet = FALSE;
1263 struct ifnet_filter *filter;
1264 struct mbuf *m = *m_p;
1265
1266 is_vlan_packet = packet_has_vlan_tag(m);
1267
1268 if (TAILQ_EMPTY(&ifp->if_flt_head)) {
1269 return 0;
1270 }
1271
1272 /*
1273 * Pass the inbound packet to the interface filters
1274 */
1275 lck_mtx_lock_spin(&ifp->if_flt_lock);
1276 /* prevent filter list from changing in case we drop the lock */
1277 if_flt_monitor_busy(ifp);
1278 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
1279 int result;
1280
1281 /* exclude VLAN packets from external filters PR-3586856 */
1282 if (is_vlan_packet &&
1283 (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
1284 continue;
1285 }
1286 /* the bridge has already seen the packet */
1287 if (skip_bridge &&
1288 (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
1289 continue;
1290 }
1291 if (!filter->filt_skip && filter->filt_input != NULL &&
1292 (filter->filt_protocol == 0 ||
1293 filter->filt_protocol == protocol_family)) {
1294 lck_mtx_unlock(&ifp->if_flt_lock);
1295
1296 result = (*filter->filt_input)(filter->filt_cookie,
1297 ifp, protocol_family, m_p, frame_header_p);
1298
1299 lck_mtx_lock_spin(&ifp->if_flt_lock);
1300 if (result != 0) {
1301 /* we're done with the filter list */
1302 if_flt_monitor_unbusy(ifp);
1303 lck_mtx_unlock(&ifp->if_flt_lock);
1304 return result;
1305 }
1306 }
1307 }
1308 /* we're done with the filter list */
1309 if_flt_monitor_unbusy(ifp);
1310 lck_mtx_unlock(&ifp->if_flt_lock);
1311
1312 /*
1313 * Strip away M_PROTO1 bit prior to sending packet up the stack as
1314 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
1315 */
1316 if (*m_p != NULL) {
1317 (*m_p)->m_flags &= ~M_PROTO1;
1318 }
1319
1320 return 0;
1321 }
1322
/*
 * Entry point for the shared main input thread.  Transitions the thread
 * into the embryonic state and parks it on dlil_main_input_thread_cont,
 * which performs all subsequent work.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	dlil_threading_info_ref_t inp = v;

	/* this entry point serves only the singleton main input thread */
	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	/*
	 * Assert the wait before issuing the self-wakeup so the wakeup
	 * is consumed as soon as the thread blocks on the continuation.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1345
1346 /*
1347 * Main input thread:
1348 *
1349 * a) handles all inbound packets for lo0
1350 * b) handles all inbound packets for interfaces with no dedicated
1351 * input thread (e.g. anything but Ethernet/PDP or those that support
1352 * opportunistic polling.)
1353 * c) protocol registrations
1354 * d) packet injections
1355 */
/*
 * Continuation for the main input thread; see the block comment above
 * for its responsibilities.  Invoked with no stack context each time
 * the thread unblocks; `v' carries the dlil_main_threading_info (whose
 * leading member aliases a dlil_threading_info).  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_main_threading_info_ref_t inpm = v;
	dlil_threading_info_ref_t inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		/* consume the pending wakeup request(s) */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/* snapshot pending protocol registration/injection work */
		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* lock dropped: the work below must not touch dlth_flags */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* no new wakeup arrived while the lock was dropped: go idle */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* park on this continuation until the next wakeup */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1442
1443 /*
1444 * Input thread for interfaces with opportunistic polling input model.
1445 */
/*
 * Entry point for an opportunistic-polling (RXPOLL) per-interface input
 * thread.  Names the thread after its interface, transitions it into
 * the embryonic state, and parks it on dlil_rxpoll_input_thread_cont,
 * which performs all subsequent work.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* only legacy RXPOLL-capable interfaces use this entry point */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	/*
	 * Assert the wait before issuing the self-wakeup so the wakeup
	 * is consumed as soon as the thread blocks on the continuation.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1478
/*
 * Continuation for an opportunistic-polling (RXPOLL) input thread.
 * Besides draining the thread's packet queue, it samples inbound
 * packet/byte/wakeup rates into EWMAs (see DLIL_EWMA) and, when the
 * averages cross the interface's low/high watermarks, switches the
 * driver between interrupt (POLL_OFF) and polling (POLL_ON) mode via
 * the if_input_ctl downcall.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* tear down if interrupted or if termination was requested */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		/* consume the pending wakeup request(s) */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass: leave embryonic state; skip rate sampling */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to its floor */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep accumulating until the hold time elapses */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			/* debug logging, rate-limited by dlil_dbgrate */
			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* hold time between transitions prevents flapping */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* both averages below low watermarks: stop polling */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				/* sustained high load: start polling */
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		/* lock dropped: the work below must not touch dlth_flags */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode. Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* tell the driver about the new input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no new wakeup arrived while the lock was dropped: go idle */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* park on this continuation until the next wakeup */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1764
1765 static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t * input_queue,dlil_freeq_t * freeq,struct ifnet_stat_increment_param * stat_delta)1766 dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
1767 dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
1768 {
1769 uint32_t overcommitted_qlen; /* Length in packets. */
1770 uint64_t overcommitted_qsize; /* Size in bytes. */
1771 uint32_t target_qlen; /* The desired queue length after trimming. */
1772 uint32_t pkts_to_drop = 0; /* Number of packets to drop. */
1773 uint32_t dropped_pkts = 0; /* Number of packets that were dropped. */
1774 uint32_t dropped_bytes = 0; /* Number of dropped bytes. */
1775 struct mbuf *m = NULL, *m_tmp = NULL;
1776
1777 overcommitted_qlen = qlen(input_queue);
1778 overcommitted_qsize = qsize(input_queue);
1779 target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;
1780
1781 if (overcommitted_qlen <= target_qlen) {
1782 /*
1783 * The queue is already within the target limits.
1784 */
1785 dropped_pkts = 0;
1786 goto out;
1787 }
1788
1789 pkts_to_drop = overcommitted_qlen - target_qlen;
1790
1791 /*
1792 * Proceed to removing packets from the head of the queue,
1793 * starting from the oldest, until the desired number of packets
1794 * has been dropped.
1795 */
1796 MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
1797 if (pkts_to_drop <= dropped_pkts) {
1798 break;
1799 }
1800 MBUFQ_REMOVE(&qmbufq(input_queue), m);
1801 MBUFQ_NEXT(m) = NULL;
1802 MBUFQ_ENQUEUE(freeq, m);
1803
1804 dropped_pkts += 1;
1805 dropped_bytes += m_length(m);
1806 }
1807
1808 /*
1809 * Adjust the length and the estimated size of the queue
1810 * after trimming.
1811 */
1812 VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
1813 qlen(input_queue) = target_qlen;
1814
1815 /* qsize() is an approximation. */
1816 if (dropped_bytes < qsize(input_queue)) {
1817 qsize(input_queue) -= dropped_bytes;
1818 } else {
1819 qsize(input_queue) = 0;
1820 }
1821
1822 /*
1823 * Adjust the ifnet statistics increments, if needed.
1824 */
1825 stat_delta->dropped += dropped_pkts;
1826 if (dropped_pkts < stat_delta->packets_in) {
1827 stat_delta->packets_in -= dropped_pkts;
1828 } else {
1829 stat_delta->packets_in = 0;
1830 }
1831 if (dropped_bytes < stat_delta->bytes_in) {
1832 stat_delta->bytes_in -= dropped_bytes;
1833 } else {
1834 stat_delta->bytes_in = 0;
1835 }
1836
1837 out:
1838 if (dlil_verbose) {
1839 /*
1840 * The basic information about the drop is logged
1841 * by the invoking function (dlil_input_{,a}sync).
1842 * If `dlil_verbose' flag is set, provide more information
1843 * that can be useful for debugging.
1844 */
1845 DLIL_PRINTF("%s: "
1846 "qlen: %u -> %u, "
1847 "qsize: %llu -> %llu "
1848 "qlimit: %u (sysctl: %u) "
1849 "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
1850 "dropped_pkts: %u dropped_bytes %u\n",
1851 __func__,
1852 overcommitted_qlen, qlen(input_queue),
1853 overcommitted_qsize, qsize(input_queue),
1854 qlimit(input_queue), if_rcvq_burst_limit,
1855 target_qlen, if_rcvq_trim_pct, pkts_to_drop,
1856 dropped_pkts, dropped_bytes);
1857 }
1858
1859 return dropped_pkts;
1860 }
1861
1862 static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp,mbuf_t m,u_int32_t cnt)1863 handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
1864 {
1865 lck_mtx_lock_spin(&ifp->if_flt_lock);
1866 if_flt_monitor_busy(ifp);
1867 lck_mtx_unlock(&ifp->if_flt_lock);
1868
1869 if (ifp->if_bridge != NULL) {
1870 m = bridge_early_input(ifp, m, cnt);
1871 }
1872 lck_mtx_lock_spin(&ifp->if_flt_lock);
1873 if_flt_monitor_unbusy(ifp);
1874 lck_mtx_unlock(&ifp->if_flt_lock);
1875 return m;
1876 }
1877