xref: /xnu-12377.41.6/bsd/net/dlil_input.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <net/if_var.h>
30 #include <net/if_var_private.h>
31 #include <net/dlil_var_private.h>
32 #include <net/dlil.h>
33 #include <net/dlil_sysctl.h>
34 
35 
/*
 * Exponentially weighted moving average update.
 * When a previous average exists (old > 0), computes:
 *   avg = ((avg << decay) - avg + new) >> decay
 * i.e. avg = avg * (1 - 2^-decay) + new * 2^-decay, using only shifts.
 * Otherwise the average is seeded with `new' directly.
 * All operands are treated as u_int32_t; callers must pick a `decay'
 * small enough that (old << decay) cannot overflow 32 bits.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
44 
45 
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True only for mbuf-backed queues (QP_MBUF) whose current length
 * exceeds BOTH the global `if_rcvq_burst_limit' sysctl and the
 * queue's own configured limit; wrapped in __improbable() because
 * overcommit is the rare case on the hot input path.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)
52 
53 
54 /* rate limit debug messages */
55 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
56 
57 extern void proto_input_run(void);
58 
59 static errno_t dlil_input_async(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
60 static errno_t dlil_input_sync(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
61 static void dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, protocol_family_t pf);
62 static void dlil_input_packet_list_common(struct ifnet *, mbuf_ref_t, u_int32_t, ifnet_model_t, boolean_t);
63 static void dlil_input_thread_func(void *, wait_result_t);
64 static void dlil_input_thread_cont(void *, wait_result_t);
65 static inline void dlil_input_wakeup(struct dlil_threading_info *inp);
66 
67 static int dlil_interface_filters_input(struct ifnet *, mbuf_ref_ref_t, char **, protocol_family_t, boolean_t);
68 
69 static void dlil_main_input_thread_func(void *, wait_result_t);
70 static void dlil_main_input_thread_cont(void *, wait_result_t);
71 
72 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
73 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
74 
75 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue, dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta);
76 
77 static inline mbuf_t handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt);
78 /*
79  * Publicly visible functions.
80  */
81 
/*
 * Create and configure the DLIL input thread servicing `inp'.
 *
 * Strategy selection:
 *   - ifp == NULL: the main input thread (called once from dlil_init).
 *   - legacy interface with RX polling enabled: hybrid poll thread.
 *   - net_async or legacy interface: asynchronous worker thread.
 *   - otherwise (netif-backed, non-legacy): synchronous strategy;
 *     no kernel thread is started and ENODEV is returned by design.
 *
 * On success the chosen continuation function is reported through
 * `thfunc' (if non-NULL), the thread is started at default precedence,
 * and — when net_affinity is set — tagged with a random affinity tag
 * so the matching workloop/starter thread can share its pset.
 * Failure to start a thread is fatal (panic).
 */
int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* Hybrid polling only for legacy (non-netif) RXPOLL-capable interfaces */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "main_input");
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
		inp->dlth_name = __unsafe_null_terminated_from_indexable(inp->dlth_name_storage);
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	/* Tail-drop mbuf queue; main thread also gets a dedicated lo0 queue */
	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		dlil_main_threading_info_ref_t inpm =
		    __container_of(inp, struct dlil_main_threading_info, inp);
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* Synchronous strategy: no dedicated thread; ENODEV is expected here */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* Run the input thread at default (importance 0) precedence */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp __single = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				/* extra ref dropped at detach when tearing down affinity */
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
224 
/*
 * Tear down a per-interface input thread; must be called on that thread
 * itself (never on the main input thread).  Drains any queued packets
 * under the lock, completes the DLIL_INPUT_TERMINATE handshake with the
 * waiter, frees the drained packets after dropping the lock, releases
 * the extra thread reference taken at creation, and terminates.
 * Does not return.
 */
void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	ifnet_ref_t ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/*
	 * Detach the queued packets and signal termination completion
	 * while holding the thread lock; the waiter sleeps on dlth_flags.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
272 
273 boolean_t
dlil_is_rxpoll_input(thread_continue_t func)274 dlil_is_rxpoll_input(thread_continue_t func)
275 {
276 	return func == dlil_rxpoll_input_thread_func;
277 }
278 
/*
 * Top-level inbound packet handler: routes the chain `m_head'..`m_tail'
 * (described by the stat increment `s') to the interface's input thread
 * via its configured strategy (async or sync).  `poll' indicates the
 * caller is the poller thread; `tp' identifies the calling thread for
 * affinity bookkeeping.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	dlil_threading_info_ref_t inp = ifp->if_inp;

	/* Interfaces without a dedicated thread fall back to the main one */
	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* Threads marked NET_THREAD_SYNC_RX force the synchronous path (testing) */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
299 
300 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)301 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
302 {
303 	return dlil_input_packet_list_common(ifp, m, 0,
304 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
305 }
306 
307 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)308 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
309     u_int32_t cnt, ifnet_model_t mode)
310 {
311 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
312 }
313 
314 /*
315  * Static function implementations.
316  */
317 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_ref_t m)318 dlil_ifproto_input(struct if_proto * ifproto, mbuf_ref_t m)
319 {
320 	int error;
321 
322 	if (ifproto->proto_kpi == kProtoKPI_v1) {
323 		/* Version 1 protocols get one packet at a time */
324 		while (m != NULL) {
325 			/*
326 			 * Version 1 KPI does not accept header len,
327 			 * hence the pointer to the frame header must be `__single'.
328 			 */
329 			char *frame_header_ptr __single;
330 
331 			mbuf_t next_packet;
332 
333 			next_packet = m->m_nextpkt;
334 			m->m_nextpkt = NULL;
335 			frame_header_ptr = m->m_pkthdr.pkt_hdr;
336 
337 			m->m_pkthdr.pkt_hdr = NULL;
338 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
339 			    ifproto->protocol_family, m, frame_header_ptr);
340 			if (error != 0 && error != EJUSTRETURN) {
341 				m_drop_if(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
342 			}
343 			m = next_packet;
344 		}
345 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
346 		/* Version 2 protocols support packet lists */
347 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
348 		    ifproto->protocol_family, m);
349 		if (error != 0 && error != EJUSTRETURN) {
350 			m_drop_list(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
351 		}
352 	}
353 }
354 
/*
 * Asynchronous input strategy: enqueue the chain on the input thread's
 * queue (or the dedicated lo0 queue for loopback on the main thread),
 * trim any overcommitted burst, update stats, and wake the worker.
 * Excess packets trimmed under the lock are collected on `freeq' and
 * dropped only after dlth_lock is released.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* s_adj starts as a copy of *s and is reduced if the queue is trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* dlil_affinity_set() may block; drop the spin lock around it */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			dlil_main_threading_info_ref_t inpm =
			    __container_of(inp, struct dlil_main_threading_info, inp);
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* Burst control: trim excess into freeq; freed after unlock */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#else
	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
501 
/*
 * Synchronous input strategy (netif-backed, non-legacy interfaces):
 * enqueue the chain, trim overcommitted bursts, then immediately drain
 * the queue and process the packets on the calling thread via
 * dlil_input_packet_list_extended().  Trimmed packets are freed only
 * after dlth_lock is released.  Never used for the main input thread.
 * Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* s_adj starts as a copy of *s and is reduced if the queue is trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* Burst control: trim excess into freeq; freed after unlock */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* Verify that `s' accurately describes the input chain `m_head' */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#else
	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* Drain everything queued (possibly more than we just added) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 *
	 * N.B. this braceless `if' intentionally guards the single
	 * `notify = ...' statement below.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
614 
/*
 * Hardware checksum debugging on the receive path (PF_INET/PF_INET6
 * only).  Depending on `hwcksum_dbg_mode', this can:
 *   - force partial checksum offload (HWCKSUM_DBG_PARTIAL_FORCED) by
 *     computing the 16-bit one's complement sum in software, and/or
 *   - verify a driver-supplied partial checksum and optionally re-base
 *     it at a different start offset (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) to
 *     emulate hardware with different rx start offsets.
 * All offsets in the mbuf metadata are relative to the frame header;
 * `hlen' converts between frame-header-relative and data-relative.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity: the frame header must lie within this mbuf's buffer */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length: distance from frame header to payload */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard whatever RX checksum state the driver provided */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is frame-header relative, hence + hlen */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Convert to a data-relative offset for m_sum16() */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			/* Nothing to do if already at that offset or out of range */
			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-base the existing sum from rxoff to aoff */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
739 
740 #if (DEVELOPMENT || DEBUG)
741 static void
dlil_input_process_wake_packet(ifnet_t ifp,protocol_family_t protocol_family,mbuf_ref_t m)742 dlil_input_process_wake_packet(ifnet_t ifp, protocol_family_t protocol_family, mbuf_ref_t m)
743 {
744 	/*
745 	 * For testing we do not care about broadcast and multicast packets as
746 	 * they are not as controllable as unicast traffic
747 	 */
748 	if (check_wake_mbuf(ifp, protocol_family, m) == false) {
749 		return;
750 	}
751 	if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
752 		if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
753 		    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
754 			/*
755 			 * This is a one-shot command
756 			 */
757 			ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
758 
759 			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
760 		}
761 	}
762 }
763 #endif /* (DEVELOPMENT || DEBUG) */
764 
765 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,mbuf_ref_t m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)766 dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m,
767     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
768 {
769 	int error = 0;
770 	protocol_family_t protocol_family;
771 	mbuf_t next_packet;
772 	ifnet_t ifp = ifp_param;
773 	char *__single frame_header = NULL;
774 	if_proto_ref_t last_ifproto = NULL;
775 	mbuf_t pkt_first = NULL;
776 	mbuf_t *pkt_next = NULL;
777 	u_int32_t poll_thresh = 0, poll_ival = 0;
778 	int iorefcnt = 0;
779 	boolean_t skip_bridge_filter = FALSE;
780 
781 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
782 
783 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
784 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
785 		poll_thresh = cnt;
786 	}
787 	if (bridge_enable_early_input != 0 &&
788 	    ifp != NULL && ifp->if_bridge != NULL) {
789 		m = handle_bridge_early_input(ifp, m, cnt);
790 		skip_bridge_filter = TRUE;
791 	}
792 	while (m != NULL) {
793 		if_proto_ref_t ifproto = NULL;
794 		uint32_t pktf_mask;     /* pkt flags to preserve */
795 
796 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
797 		m_add_hdr_crumb_interface_input(m, ifp->if_index, false);
798 
799 		if (ifp_param == NULL) {
800 			ifp = m->m_pkthdr.rcvif;
801 		}
802 
803 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
804 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
805 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
806 			ifnet_poll(ifp);
807 		}
808 
809 		/* Check if this mbuf looks valid */
810 		MBUF_INPUT_CHECK(m, ifp);
811 
812 		next_packet = m->m_nextpkt;
813 		m->m_nextpkt = NULL;
814 		frame_header = m->m_pkthdr.pkt_hdr;
815 		m->m_pkthdr.pkt_hdr = NULL;
816 
817 		/*
818 		 * Get an IO reference count if the interface is not
819 		 * loopback (lo0) and it is attached; lo0 never goes
820 		 * away, so optimize for that.
821 		 */
822 		if (ifp != lo_ifp) {
823 			/* iorefcnt is 0 if it hasn't been taken yet */
824 			if (iorefcnt == 0) {
825 				if (!ifnet_datamov_begin(ifp)) {
826 					m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_DATAMOV_BEGIN, NULL, 0);
827 					goto next;
828 				}
829 			}
830 			iorefcnt = 1;
831 			/*
832 			 * Preserve the time stamp and skip pktap flags.
833 			 */
834 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
835 		} else {
836 			/*
837 			 * If this arrived on lo0, preserve interface addr
838 			 * info to allow for connectivity between loopback
839 			 * and local interface addresses.
840 			 */
841 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
842 		}
843 		pktf_mask |= PKTF_WAKE_PKT;
844 
845 		/* make sure packet comes in clean */
846 		m_classifier_init(m, pktf_mask);
847 
848 		ifp_inc_traffic_class_in(ifp, m);
849 
850 		/* find which protocol family this packet is for */
851 		ifnet_lock_shared(ifp);
852 		error = (*ifp->if_demux)(ifp, m, frame_header,
853 		    &protocol_family);
854 		ifnet_lock_done(ifp);
855 		if (error != 0) {
856 			if (error == EJUSTRETURN) {
857 				goto next;
858 			}
859 			protocol_family = 0;
860 		}
861 		/* check for an updated frame header */
862 		if (m->m_pkthdr.pkt_hdr != NULL) {
863 			frame_header = m->m_pkthdr.pkt_hdr;
864 			m->m_pkthdr.pkt_hdr = NULL;
865 		}
866 
867 #if (DEVELOPMENT || DEBUG)
868 		/* For testing only */
869 		dlil_input_process_wake_packet(ifp, protocol_family, m);
870 #endif /* (DEVELOPMENT || DEBUG) */
871 
872 		pktap_input(ifp, protocol_family, m, frame_header);
873 
874 		/* Drop v4 packets received on CLAT46 enabled cell interface */
875 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
876 		    ifp->if_type == IFT_CELLULAR) {
877 			m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
878 			ip6stat.ip6s_clat464_in_v4_drop++;
879 			goto next;
880 		}
881 
882 		/* Translate the packet if it is received on CLAT interface */
883 		if ((m->m_flags & M_PROMISC) == 0 &&
884 		    protocol_family == PF_INET6 &&
885 		    IS_INTF_CLAT46(ifp) &&
886 		    dlil_is_clat_needed(protocol_family, m)) {
887 			char *data = NULL;
888 			struct ether_header eh;
889 			struct ether_header *ehp = NULL;
890 
891 			if (ifp->if_type == IFT_ETHER) {
892 				ehp = (struct ether_header *)(void *)frame_header;
893 				/* Skip RX Ethernet packets if they are not IPV6 */
894 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
895 					goto skip_clat;
896 				}
897 
898 				/* Keep a copy of frame_header for Ethernet packets */
899 				char *fh = __unsafe_forge_bidi_indexable(char *, m->m_pkthdr.pkt_hdr, ifnet_hdrlen(ifp));
900 				if (fh) {
901 					bcopy(fh, (caddr_t)&eh, ETHER_HDR_LEN);
902 				}
903 			}
904 			error = dlil_clat64(ifp, &protocol_family, &m);
905 			data = mtod(m, char*);
906 			if (error != 0) {
907 				m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
908 				ip6stat.ip6s_clat464_in_drop++;
909 				goto next;
910 			}
911 			/* Native v6 should be No-op */
912 			if (protocol_family != PF_INET) {
913 				goto skip_clat;
914 			}
915 
916 			/* Do this only for translated v4 packets. */
917 			switch (ifp->if_type) {
918 			case IFT_CELLULAR:
919 				frame_header = data;
920 				break;
921 			case IFT_ETHER:
922 				/*
923 				 * Drop if the mbuf doesn't have enough
924 				 * space for Ethernet header
925 				 */
926 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
927 					m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
928 					ip6stat.ip6s_clat464_in_drop++;
929 					goto next;
930 				}
931 				/*
932 				 * Set the frame_header ETHER_HDR_LEN bytes
933 				 * preceeding the data pointer. Change
934 				 * the ether_type too.
935 				 * N.B. The variable `fh' is needed because
936 				 * the `frame_header' variable is `__single',
937 				 * and hence would not be appropriate for use with `bcopy'.
938 				 */
939 				char *fh = data - ETHER_HDR_LEN;
940 				frame_header = fh;
941 				eh.ether_type = htons(ETHERTYPE_IP);
942 				bcopy((caddr_t)&eh, fh, ETHER_HDR_LEN);
943 				break;
944 			}
945 		}
946 skip_clat:
947 		/*
948 		 * Match the wake packet against the list of ports that has been
949 		 * been queried by the driver before the device went to sleep
950 		 */
951 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
952 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
953 				if_ports_used_match_mbuf(ifp, protocol_family, m);
954 			}
955 		}
956 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
957 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
958 			dlil_input_cksum_dbg(ifp, m, frame_header,
959 			    protocol_family);
960 		}
961 		/*
962 		 * For partial checksum offload, we expect the driver to
963 		 * set the start offset indicating the start of the span
964 		 * that is covered by the hardware-computed checksum;
965 		 * adjust this start offset accordingly because the data
966 		 * pointer has been advanced beyond the link-layer header.
967 		 *
968 		 * Virtual lan types (bridge, vlan, bond) can call
969 		 * dlil_input_packet_list() with the same packet with the
970 		 * checksum flags set. Set a flag indicating that the
971 		 * adjustment has already been done.
972 		 */
973 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
974 			/* adjustment has already been done */
975 		} else if ((m->m_pkthdr.csum_flags &
976 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
977 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
978 			int adj;
979 			if (frame_header == NULL ||
980 			    frame_header < (char *)mbuf_datastart(m) ||
981 			    frame_header > (char *)m->m_data ||
982 			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
983 			    m->m_pkthdr.csum_rx_start) {
984 				m->m_pkthdr.csum_data = 0;
985 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
986 				hwcksum_in_invalidated++;
987 			} else {
988 				m->m_pkthdr.csum_rx_start -= adj;
989 			}
990 			/* make sure we don't adjust more than once */
991 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
992 		}
993 		if (clat_debug) {
994 			pktap_input(ifp, protocol_family, m, frame_header);
995 		}
996 
997 		if (m->m_flags & (M_BCAST | M_MCAST)) {
998 			os_atomic_inc(&ifp->if_imcasts, relaxed);
999 		}
1000 
1001 		/* run interface filters */
1002 		error = dlil_interface_filters_input(ifp, &m,
1003 		    &frame_header, protocol_family, skip_bridge_filter);
1004 		if (error != 0) {
1005 			if (error != EJUSTRETURN) {
1006 				m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
1007 			}
1008 			goto next;
1009 		}
1010 		/*
1011 		 * A VLAN and Bond interface receives packets by attaching
1012 		 * a "protocol" to the underlying interface.
1013 		 * A promiscuous packet needs to be delivered to the
1014 		 * VLAN or Bond interface since:
1015 		 * - Bond interface member may not support setting the
1016 		 *   MAC address, so packets are inherently "promiscuous"
1017 		 * - A VLAN or Bond interface could be members of a bridge,
1018 		 *   where promiscuous packets correspond to other
1019 		 *   devices that the bridge forwards packets to/from
1020 		 */
1021 		if ((m->m_flags & M_PROMISC) != 0) {
1022 			switch (protocol_family) {
1023 			case PF_VLAN:
1024 			case PF_BOND:
1025 				/* VLAN and Bond get promiscuous packets */
1026 				break;
1027 			default:
1028 				if (droptap_verbose > 0) {
1029 					m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_PROMISC, NULL, 0);
1030 				} else {
1031 					m_freem(m);
1032 				}
1033 				goto next;
1034 			}
1035 		}
1036 
1037 		/* Lookup the protocol attachment to this interface */
1038 		if (protocol_family == 0) {
1039 			ifproto = NULL;
1040 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
1041 		    (last_ifproto->protocol_family == protocol_family)) {
1042 			VERIFY(ifproto == NULL);
1043 			ifproto = last_ifproto;
1044 			if_proto_ref(last_ifproto);
1045 		} else {
1046 			VERIFY(ifproto == NULL);
1047 			ifnet_lock_shared(ifp);
1048 			/* callee holds a proto refcnt upon success */
1049 			ifproto = find_attached_proto(ifp, protocol_family);
1050 			ifnet_lock_done(ifp);
1051 		}
1052 		if (ifproto == NULL) {
1053 			/* no protocol for this packet, discard */
1054 			m_drop_extended(m, ifp, frame_header, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0);
1055 			goto next;
1056 		}
1057 		if (ifproto != last_ifproto) {
1058 			if (last_ifproto != NULL) {
1059 				/* pass up the list for the previous protocol */
1060 				dlil_ifproto_input(last_ifproto, pkt_first);
1061 				pkt_first = NULL;
1062 				if_proto_free(last_ifproto);
1063 			}
1064 			last_ifproto = ifproto;
1065 			if_proto_ref(ifproto);
1066 		}
1067 		/* extend the list */
1068 		m->m_pkthdr.pkt_hdr = frame_header;
1069 		if (pkt_first == NULL) {
1070 			pkt_first = m;
1071 		} else {
1072 			*pkt_next = m;
1073 		}
1074 		pkt_next = &m->m_nextpkt;
1075 
1076 next:
1077 		if (next_packet == NULL && last_ifproto != NULL) {
1078 			/* pass up the last list of packets */
1079 			dlil_ifproto_input(last_ifproto, pkt_first);
1080 			if_proto_free(last_ifproto);
1081 			last_ifproto = NULL;
1082 		}
1083 		if (ifproto != NULL) {
1084 			if_proto_free(ifproto);
1085 			ifproto = NULL;
1086 		}
1087 
1088 		m = next_packet;
1089 
1090 		/* update the driver's multicast filter, if needed */
1091 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
1092 			ifp->if_updatemcasts = 0;
1093 		}
1094 		if (iorefcnt == 1) {
1095 			/* If the next mbuf is on a different interface, unlock data-mov */
1096 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
1097 				ifnet_datamov_end(ifp);
1098 				iorefcnt = 0;
1099 			}
1100 		}
1101 	}
1102 
1103 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
1104 }
1105 
1106 /*
1107  * Input thread for interfaces with legacy input model.
1108  */
/*
 * Bootstrap entry point for a per-interface (legacy model) input thread.
 * Names the thread after its interface, transitions it into the embryonic
 * state, and parks it on the continuation routine; all subsequent work
 * happens in dlil_input_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* this entry point is only for dedicated (non-main) legacy threads */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* RXPOLL-capable legacy interfaces use dlil_rxpoll_input_thread_func */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

#if CONFIG_THREAD_GROUPS
	if (IFNET_REQUIRES_CELL_GROUP(ifp)) {
		thread_group_join_cellular();
	}
#endif /* CONFIG_THREAD_GROUPS */

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait() must precede the unlock so no wakeup can be missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1150 
/*
 * Continuation routine for a per-interface (legacy model) input thread.
 * Runs each time the thread is woken: drains the thread's receive queue,
 * syncs interface statistics, and hands the packet chain to
 * dlil_input_packet_list_extended().  Loops while more work is pending,
 * then either terminates (if requested) or blocks on itself again.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out early if we were interrupted or asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		/* consume the pending-work indication for this pass */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first wakeup after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no flags other than RUNNING/TERMINATE: nothing left to do */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait before dropping the lock, then block */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1254 
1255 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)1256 dlil_input_wakeup(struct dlil_threading_info *inp)
1257 {
1258 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
1259 
1260 	inp->dlth_flags |= DLIL_INPUT_WAITING;
1261 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
1262 		inp->dlth_wtot++;
1263 		wakeup_one((caddr_t)&inp->dlth_flags);
1264 	}
1265 }
1266 
/*
 * Run an inbound packet through the interface filters attached to ifp.
 *
 * m_p / frame_header_p are in-out: a filter may replace the mbuf chain
 * and/or the frame header pointer.  Returns 0 when the packet should
 * continue up the stack, or the filter's non-zero result when a filter
 * consumed or rejected it (EJUSTRETURN means consumed, per callers).
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, mbuf_ref_ref_t m_p,
    char **frame_header_p, protocol_family_t protocol_family,
    boolean_t skip_bridge)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/* no filters attached: nothing to do */
	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}
		/* the bridge has already seen the packet */
		if (skip_bridge &&
		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
			continue;
		}
		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callback; the busy
			 * marker above keeps the list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
1332 
/*
 * Bootstrap entry point for the main (shared) DLIL input thread.
 * Transitions the thread into the embryonic state and parks it on
 * dlil_main_input_thread_cont(); never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	dlil_threading_info_ref_t inp = v;

	/* the main input thread is singular and has no bound interface */
	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait() must precede the unlock so no wakeup can be missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1355 
1356 /*
1357  * Main input thread:
1358  *
1359  *   a) handles all inbound packets for lo0
1360  *   b) handles all inbound packets for interfaces with no dedicated
1361  *	input thread (e.g. anything but Ethernet/PDP or those that support
1362  *	opportunistic polling.)
1363  *   c) protocol registrations
1364  *   d) packet injections
1365  */
/*
 * Continuation routine for the main input thread.  Each wakeup drains
 * two queues -- packets for non-dedicated interfaces and packets
 * exclusive to lo0 -- delivers them via dlil_input_packet_list_extended(),
 * and services pending protocol registrations/injections.  The main
 * thread can never be terminated; when idle it blocks on itself again.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed through both types; inpm adds lo0's queue */
	dlil_main_threading_info_ref_t inpm = v;
	dlil_threading_info_ref_t inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		/* consume the pending-work indication for this pass */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first wakeup after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/* snapshot protocol work requests while holding the lock */
		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		/* NULL ifp: each packet's rcvif identifies its interface */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	/* re-arm the wait before dropping the lock, then block */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1452 
1453 /*
1454  * Input thread for interfaces with opportunistic polling input model.
1455  */
/*
 * Bootstrap entry point for an input thread serving an interface with
 * the opportunistic polling (RXPOLL) input model.  Names the thread,
 * enters the embryonic state, and parks on
 * dlil_rxpoll_input_thread_cont(); never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* only legacy RXPOLL-capable interfaces use this entry point */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait() must precede the unlock so no wakeup can be missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1488 
/*
 * Continuation routine for an RXPOLL (opportunistic polling) input thread.
 *
 * Each wakeup drains the thread's receive queue and, while holding the
 * lock, samples inbound packet/byte/wakeup rates into EWMA estimates
 * (DLIL_EWMA with if_rxpoll_decay).  When the averages cross the
 * low/high watermarks, the operating mode toggles between
 * IFNET_MODEL_INPUT_POLL_OFF and _ON, and the driver is notified via
 * if_input_ctl(IFNET_CTL_SET_INPUT_MODEL) outside the lock.  Packets are
 * then delivered through dlil_input_packet_list_extended().  The thread
 * terminates when DLIL_INPUT_TERMINATE is set, else blocks on itself.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out early if we were interrupted or asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		/* consume the pending-work indication for this pass */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first wakeup after creation: skip sampling, just sync up */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to its minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* sampling window not yet elapsed; keep accumulating */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* enforce a hold time between consecutive transitions */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* below both low watermarks: fall back to interrupts */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_get_ioref(ifp)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* notify the driver; on error, stats still updated below */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no flags other than RUNNING/TERMINATE: nothing left to do */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait before dropping the lock, then block */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1774 
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent of
 * its limit.  The oldest packets (at the head of the queue) are unlinked
 * and moved onto `freeq' for the caller to free; the queue's length/size
 * accounting and the caller's pending ifnet stat increments (`stat_delta')
 * are adjusted to match.  Returns the number of packets removed.
 *
 * Per the _locked suffix, the caller is expected to hold the lock
 * protecting `input_queue'.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		/* unlink from the input queue and stage on the free queue */
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 * The trimmed packets count as drops, not as received traffic;
	 * clamp at zero since the deltas may not cover all dropped data.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
1871 
/*
 * Offer an inbound packet chain `m' (of `cnt' packets) to the bridge
 * attached to `ifp' before normal DLIL input processing.  Returns the
 * (possibly substituted) chain, or presumably NULL if the bridge
 * consumed it — TODO confirm against bridge_early_input().
 */
static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
{
	/*
	 * Mark the interface-filter monitor busy before calling out;
	 * the filter lock itself is only held (spin) long enough to
	 * flip the busy state and is dropped across the bridge call.
	 * NOTE(review): this looks like it prevents filter detach
	 * while bridge_early_input() runs — confirm with
	 * if_flt_monitor_busy()'s contract.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_busy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Only divert to the bridge if one is actually attached. */
	if (ifp->if_bridge != NULL) {
		m = bridge_early_input(ifp, m, cnt);
	}

	/* Drop the busy indication taken above. */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
	return m;
}
1887