xref: /xnu-11417.140.69/bsd/net/dlil_input.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <net/if_var.h>
30 #include <net/dlil_var_private.h>
31 #include <net/dlil.h>
32 #include <net/dlil_sysctl.h>
33 
34 
/*
 * Exponentially-weighted moving average, computed with shifts only:
 *
 *   old = (old * (2^decay - 1) + new) >> decay
 *
 * i.e. the previous average is weighted (2^decay - 1)/2^decay and the
 * new sample 1/2^decay.  A zero `old' seeds the average directly with
 * `new'.  NOTE(review): assumes (old << decay) does not overflow
 * 32 bits for the decay values used at call sites -- confirm.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
43 
44 
/*
 * Detect whether a queue contains a burst that needs to be trimmed:
 * true only for mbuf-backed queues (QP_MBUF) whose current length
 * exceeds both the global `if_rcvq_burst_limit' sysctl and the
 * queue's own limit.  Wrapped in __improbable() since trimming is
 * expected to be rare.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)
51 
52 
53 /* rate limit debug messages */
54 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
55 
56 extern void proto_input_run(void);
57 
58 static errno_t dlil_input_async(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
59 static errno_t dlil_input_sync(struct dlil_threading_info *inp, struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
60 static void dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, protocol_family_t pf);
61 static void dlil_input_packet_list_common(struct ifnet *, mbuf_ref_t, u_int32_t, ifnet_model_t, boolean_t);
62 static void dlil_input_thread_func(void *, wait_result_t);
63 static void dlil_input_thread_cont(void *, wait_result_t);
64 static inline void dlil_input_wakeup(struct dlil_threading_info *inp);
65 
66 static int dlil_interface_filters_input(struct ifnet *, mbuf_ref_ref_t, char **, protocol_family_t, boolean_t);
67 
68 static void dlil_main_input_thread_func(void *, wait_result_t);
69 static void dlil_main_input_thread_cont(void *, wait_result_t);
70 
71 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
72 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
73 
74 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue, dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta);
75 
76 static inline mbuf_t handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt);
77 /*
78  * Publicly visible functions.
79  */
80 
/*
 * Initialize the threading state for a DLIL input thread and, unless
 * the synchronous strategy is selected, start the kernel thread that
 * will drain its receive queue.
 *
 * ifp    - interface the thread serves; NULL designates the main input
 *          thread created at dlil_init time.
 * inp    - per-thread state to initialize (lock, queue, name, strategy).
 * thfunc - optional out-parameter receiving the thread continuation
 *          chosen; NULL when the synchronous strategy is used.
 *
 * Returns 0 on success, ENODEV when no dedicated thread is needed
 * (synchronous strategy).  Panics if kernel_thread_start() fails.
 */
int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling only for legacy (non-netif) RXPOLL-capable drivers */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "main_input");
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
		inp->dlth_name = __unsafe_null_terminated_from_indexable(inp->dlth_name_storage);
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know which continuation was selected (NULL for sync) */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	/* per-thread lock guarding the receive queue and flags */
	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* main thread also owns a dedicated queue for lo0 traffic */
		dlil_main_threading_info_ref_t inpm =
		    __container_of(inp, struct dlil_main_threading_info, inp);
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp __single = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				/* ref dropped when affinity is torn down */
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
223 
/*
 * Final teardown of a dedicated input thread; runs on the input thread
 * itself (never on the main input thread).  Drains any queued packets
 * under the lock, marks termination complete and wakes the waiter,
 * then frees the drained packets outside the lock, drops the thread
 * reference taken by kernel_thread_start(), and terminates the current
 * thread.  Does not return.
 */
void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	ifnet_ref_t ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* pull everything off the queue before signalling completion */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets (outside the lock) */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
271 
272 boolean_t
dlil_is_rxpoll_input(thread_continue_t func)273 dlil_is_rxpoll_input(thread_continue_t func)
274 {
275 	return func == dlil_rxpoll_input_thread_func;
276 }
277 
/*
 * Common entry point for inbound packet chains: dispatch to the input
 * strategy of the thread associated with `ifp', falling back to the
 * main input thread when the interface has no dedicated one.  On
 * DEVELOPMENT/DEBUG kernels, a caller thread marked NET_THREAD_SYNC_RX
 * forces the synchronous strategy regardless of the per-thread setting.
 * Returns whatever the strategy returns (both current strategies
 * return 0).
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	dlil_threading_info_ref_t inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
298 
299 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)300 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
301 {
302 	return dlil_input_packet_list_common(ifp, m, 0,
303 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
304 }
305 
306 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)307 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
308     u_int32_t cnt, ifnet_model_t mode)
309 {
310 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
311 }
312 
313 /*
314  * Static function implementations.
315  */
316 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_ref_t m)317 dlil_ifproto_input(struct if_proto * ifproto, mbuf_ref_t m)
318 {
319 	int error;
320 
321 	if (ifproto->proto_kpi == kProtoKPI_v1) {
322 		/* Version 1 protocols get one packet at a time */
323 		while (m != NULL) {
324 			/*
325 			 * Version 1 KPI does not accept header len,
326 			 * hence the pointer to the frame header must be `__single'.
327 			 */
328 			char *frame_header_ptr __single;
329 
330 			mbuf_t next_packet;
331 
332 			next_packet = m->m_nextpkt;
333 			m->m_nextpkt = NULL;
334 			frame_header_ptr = m->m_pkthdr.pkt_hdr;
335 
336 			m->m_pkthdr.pkt_hdr = NULL;
337 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
338 			    ifproto->protocol_family, m, frame_header_ptr);
339 			if (error != 0 && error != EJUSTRETURN) {
340 				m_drop_if(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
341 			}
342 			m = next_packet;
343 		}
344 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
345 		/* Version 2 protocols support packet lists */
346 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
347 		    ifproto->protocol_family, m);
348 		if (error != 0 && error != EJUSTRETURN) {
349 			m_drop_list(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
350 		}
351 	}
352 }
353 
/*
 * Asynchronous input strategy: enqueue the inbound chain on the input
 * thread's receive queue and wake that thread; processing happens
 * later in the thread's context.
 *
 * inp    - input thread taking the packets (may be the main thread).
 * ifp    - receiving interface.
 * m_head/m_tail - packet chain (m_head may be NULL: stats-only call).
 * s      - stat increments describing the chain (packets_in/bytes_in).
 * poll   - TRUE when called from the poller thread (affects affinity
 *          slot selection and stats accounting).
 * tp     - caller's thread; used once to join the affinity set.
 *
 * Always returns 0.  If the queue exceeds its burst limit the excess
 * is trimmed and freed after the lock is dropped.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* s_adj tracks the stats after any burst-limit trimming below */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the lock: dlil_affinity_set() may not be called with it held */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			dlil_main_threading_info_ref_t inpm =
			    __container_of(inp, struct dlil_main_threading_info, inp);
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* trim bursts that exceed the queue/burst limit; the trimmed
		 * packets go to freeq and are released after unlock */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#else
	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
500 
/*
 * Synchronous input strategy: enqueue the chain, apply burst-limit
 * trimming and stats accounting exactly like the async path, but then
 * immediately dequeue everything and process it in the caller's
 * context via dlil_input_packet_list_extended().  Never used for the
 * main input thread.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* s_adj tracks the stats after any burst-limit trimming below */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* trim bursts exceeding the limit; trimmed packets are freed
	 * after the lock is dropped */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* verify that `s' accurately describes the chain `m_head' */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#else
	m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* dequeue everything (including anything previously queued) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
613 
/*
 * Debug-only receive checksum validation, driven by `hwcksum_dbg_mode'.
 * After validating that `frame_header' lies within the mbuf's data
 * area, and only for PF_INET/PF_INET6 packets:
 *  - HWCKSUM_DBG_PARTIAL_FORCED: overwrite any RX checksum flags and
 *    force partial checksum offload by computing the 16-bit one's
 *    complement sum in software from a configured offset;
 *  - verify a driver-supplied partial checksum against a sum computed
 *    here, counting bad offsets/mismatches;
 *  - HWCKSUM_DBG_PARTIAL_RXOFF_ADJ: re-base the partial sum at a
 *    different start offset to emulate hardware that sums from
 *    various offsets.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* frame header must point inside [datastart, m_data] */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* link-layer header length */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* rx_start is relative to the frame header, hence + hlen */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
738 
739 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,mbuf_ref_t m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)740 dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m,
741     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
742 {
743 	int error = 0;
744 	protocol_family_t protocol_family;
745 	mbuf_t next_packet;
746 	ifnet_t ifp = ifp_param;
747 	char *__single frame_header = NULL;
748 	if_proto_ref_t last_ifproto = NULL;
749 	mbuf_t pkt_first = NULL;
750 	mbuf_t *pkt_next = NULL;
751 	u_int32_t poll_thresh = 0, poll_ival = 0;
752 	int iorefcnt = 0;
753 	boolean_t skip_bridge_filter = FALSE;
754 
755 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
756 
757 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
758 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
759 		poll_thresh = cnt;
760 	}
761 	if (bridge_enable_early_input != 0 &&
762 	    ifp != NULL && ifp->if_bridge != NULL) {
763 		m = handle_bridge_early_input(ifp, m, cnt);
764 		skip_bridge_filter = TRUE;
765 	}
766 	while (m != NULL) {
767 		if_proto_ref_t ifproto = NULL;
768 		uint32_t pktf_mask;     /* pkt flags to preserve */
769 
770 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
771 		m_add_hdr_crumb_interface_input(m, ifp->if_index, false);
772 
773 		if (ifp_param == NULL) {
774 			ifp = m->m_pkthdr.rcvif;
775 		}
776 
777 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
778 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
779 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
780 			ifnet_poll(ifp);
781 		}
782 
783 		/* Check if this mbuf looks valid */
784 		MBUF_INPUT_CHECK(m, ifp);
785 
786 		next_packet = m->m_nextpkt;
787 		m->m_nextpkt = NULL;
788 		frame_header = m->m_pkthdr.pkt_hdr;
789 		m->m_pkthdr.pkt_hdr = NULL;
790 
791 		/*
792 		 * Get an IO reference count if the interface is not
793 		 * loopback (lo0) and it is attached; lo0 never goes
794 		 * away, so optimize for that.
795 		 */
796 		if (ifp != lo_ifp) {
797 			/* iorefcnt is 0 if it hasn't been taken yet */
798 			if (iorefcnt == 0) {
799 				if (!ifnet_datamov_begin(ifp)) {
800 					m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_DATAMOV_BEGIN, NULL, 0);
801 					goto next;
802 				}
803 			}
804 			iorefcnt = 1;
805 			/*
806 			 * Preserve the time stamp and skip pktap flags.
807 			 */
808 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
809 		} else {
810 			/*
811 			 * If this arrived on lo0, preserve interface addr
812 			 * info to allow for connectivity between loopback
813 			 * and local interface addresses.
814 			 */
815 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
816 		}
817 		pktf_mask |= PKTF_WAKE_PKT;
818 
819 		/* make sure packet comes in clean */
820 		m_classifier_init(m, pktf_mask);
821 
822 		ifp_inc_traffic_class_in(ifp, m);
823 
824 		/* find which protocol family this packet is for */
825 		ifnet_lock_shared(ifp);
826 		error = (*ifp->if_demux)(ifp, m, frame_header,
827 		    &protocol_family);
828 		ifnet_lock_done(ifp);
829 		if (error != 0) {
830 			if (error == EJUSTRETURN) {
831 				goto next;
832 			}
833 			protocol_family = 0;
834 		}
835 		/* check for an updated frame header */
836 		if (m->m_pkthdr.pkt_hdr != NULL) {
837 			frame_header = m->m_pkthdr.pkt_hdr;
838 			m->m_pkthdr.pkt_hdr = NULL;
839 		}
840 
841 #if (DEVELOPMENT || DEBUG)
842 		/*
843 		 * For testing we do not care about broadcast and multicast packets as
844 		 * they are not as controllable as unicast traffic
845 		 */
846 		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
847 			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
848 			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
849 				/*
850 				 * This is a one-shot command
851 				 */
852 				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
853 				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
854 			}
855 		}
856 #endif /* (DEVELOPMENT || DEBUG) */
857 		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
858 			char buffer[64];
859 			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
860 
861 			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
862 			    ifp->if_xname, m_pktlen(m));
863 			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
864 				log_hexdump(buffer, buflen);
865 			}
866 		}
867 
868 		pktap_input(ifp, protocol_family, m, frame_header);
869 
870 		/* Drop v4 packets received on CLAT46 enabled cell interface */
871 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
872 		    ifp->if_type == IFT_CELLULAR) {
873 			m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
874 			ip6stat.ip6s_clat464_in_v4_drop++;
875 			goto next;
876 		}
877 
878 		/* Translate the packet if it is received on CLAT interface */
879 		if ((m->m_flags & M_PROMISC) == 0 &&
880 		    protocol_family == PF_INET6 &&
881 		    IS_INTF_CLAT46(ifp) &&
882 		    dlil_is_clat_needed(protocol_family, m)) {
883 			char *data = NULL;
884 			struct ether_header eh;
885 			struct ether_header *ehp = NULL;
886 
887 			if (ifp->if_type == IFT_ETHER) {
888 				ehp = (struct ether_header *)(void *)frame_header;
889 				/* Skip RX Ethernet packets if they are not IPV6 */
890 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
891 					goto skip_clat;
892 				}
893 
894 				/* Keep a copy of frame_header for Ethernet packets */
895 				char *fh = __unsafe_forge_bidi_indexable(char *, m->m_pkthdr.pkt_hdr, ifnet_hdrlen(ifp));
896 				if (fh) {
897 					bcopy(fh, (caddr_t)&eh, ETHER_HDR_LEN);
898 				}
899 			}
900 			error = dlil_clat64(ifp, &protocol_family, &m);
901 			data = mtod(m, char*);
902 			if (error != 0) {
903 				m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
904 				ip6stat.ip6s_clat464_in_drop++;
905 				goto next;
906 			}
907 			/* Native v6 should be No-op */
908 			if (protocol_family != PF_INET) {
909 				goto skip_clat;
910 			}
911 
912 			/* Do this only for translated v4 packets. */
913 			switch (ifp->if_type) {
914 			case IFT_CELLULAR:
915 				frame_header = data;
916 				break;
917 			case IFT_ETHER:
918 				/*
919 				 * Drop if the mbuf doesn't have enough
920 				 * space for Ethernet header
921 				 */
922 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
923 					m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
924 					ip6stat.ip6s_clat464_in_drop++;
925 					goto next;
926 				}
927 				/*
928 				 * Set the frame_header ETHER_HDR_LEN bytes
929 				 * preceeding the data pointer. Change
930 				 * the ether_type too.
931 				 * N.B. The variable `fh' is needed because
932 				 * the `frame_header' variable is `__single',
933 				 * and hence would not be appropriate for use with `bcopy'.
934 				 */
935 				char *fh = data - ETHER_HDR_LEN;
936 				frame_header = fh;
937 				eh.ether_type = htons(ETHERTYPE_IP);
938 				bcopy((caddr_t)&eh, fh, ETHER_HDR_LEN);
939 				break;
940 			}
941 		}
942 skip_clat:
943 		/*
944 		 * Match the wake packet against the list of ports that has been
945 		 * been queried by the driver before the device went to sleep
946 		 */
947 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
948 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
949 				if_ports_used_match_mbuf(ifp, protocol_family, m);
950 			}
951 		}
952 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
953 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
954 			dlil_input_cksum_dbg(ifp, m, frame_header,
955 			    protocol_family);
956 		}
957 		/*
958 		 * For partial checksum offload, we expect the driver to
959 		 * set the start offset indicating the start of the span
960 		 * that is covered by the hardware-computed checksum;
961 		 * adjust this start offset accordingly because the data
962 		 * pointer has been advanced beyond the link-layer header.
963 		 *
964 		 * Virtual lan types (bridge, vlan, bond) can call
965 		 * dlil_input_packet_list() with the same packet with the
966 		 * checksum flags set. Set a flag indicating that the
967 		 * adjustment has already been done.
968 		 */
969 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
970 			/* adjustment has already been done */
971 		} else if ((m->m_pkthdr.csum_flags &
972 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
973 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
974 			int adj;
975 			if (frame_header == NULL ||
976 			    frame_header < (char *)mbuf_datastart(m) ||
977 			    frame_header > (char *)m->m_data ||
978 			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
979 			    m->m_pkthdr.csum_rx_start) {
980 				m->m_pkthdr.csum_data = 0;
981 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
982 				hwcksum_in_invalidated++;
983 			} else {
984 				m->m_pkthdr.csum_rx_start -= adj;
985 			}
986 			/* make sure we don't adjust more than once */
987 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
988 		}
989 		if (clat_debug) {
990 			pktap_input(ifp, protocol_family, m, frame_header);
991 		}
992 
993 		if (m->m_flags & (M_BCAST | M_MCAST)) {
994 			os_atomic_inc(&ifp->if_imcasts, relaxed);
995 		}
996 
997 		/* run interface filters */
998 		error = dlil_interface_filters_input(ifp, &m,
999 		    &frame_header, protocol_family, skip_bridge_filter);
1000 		if (error != 0) {
1001 			if (error != EJUSTRETURN) {
1002 				m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
1003 			}
1004 			goto next;
1005 		}
1006 		/*
1007 		 * A VLAN and Bond interface receives packets by attaching
1008 		 * a "protocol" to the underlying interface.
1009 		 * A promiscuous packet needs to be delivered to the
1010 		 * VLAN or Bond interface since:
1011 		 * - Bond interface member may not support setting the
1012 		 *   MAC address, so packets are inherently "promiscuous"
1013 		 * - A VLAN or Bond interface could be members of a bridge,
1014 		 *   where promiscuous packets correspond to other
1015 		 *   devices that the bridge forwards packets to/from
1016 		 */
1017 		if ((m->m_flags & M_PROMISC) != 0) {
1018 			switch (protocol_family) {
1019 			case PF_VLAN:
1020 			case PF_BOND:
1021 				/* VLAN and Bond get promiscuous packets */
1022 				break;
1023 			default:
1024 				if (droptap_verbose > 0) {
1025 					m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_PROMISC, NULL, 0);
1026 				} else {
1027 					m_freem(m);
1028 				}
1029 				goto next;
1030 			}
1031 		}
1032 
1033 		/* Lookup the protocol attachment to this interface */
1034 		if (protocol_family == 0) {
1035 			ifproto = NULL;
1036 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
1037 		    (last_ifproto->protocol_family == protocol_family)) {
1038 			VERIFY(ifproto == NULL);
1039 			ifproto = last_ifproto;
1040 			if_proto_ref(last_ifproto);
1041 		} else {
1042 			VERIFY(ifproto == NULL);
1043 			ifnet_lock_shared(ifp);
1044 			/* callee holds a proto refcnt upon success */
1045 			ifproto = find_attached_proto(ifp, protocol_family);
1046 			ifnet_lock_done(ifp);
1047 		}
1048 		if (ifproto == NULL) {
1049 			/* no protocol for this packet, discard */
1050 			m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0);
1051 			goto next;
1052 		}
1053 		if (ifproto != last_ifproto) {
1054 			if (last_ifproto != NULL) {
1055 				/* pass up the list for the previous protocol */
1056 				dlil_ifproto_input(last_ifproto, pkt_first);
1057 				pkt_first = NULL;
1058 				if_proto_free(last_ifproto);
1059 			}
1060 			last_ifproto = ifproto;
1061 			if_proto_ref(ifproto);
1062 		}
1063 		/* extend the list */
1064 		m->m_pkthdr.pkt_hdr = frame_header;
1065 		if (pkt_first == NULL) {
1066 			pkt_first = m;
1067 		} else {
1068 			*pkt_next = m;
1069 		}
1070 		pkt_next = &m->m_nextpkt;
1071 
1072 next:
1073 		if (next_packet == NULL && last_ifproto != NULL) {
1074 			/* pass up the last list of packets */
1075 			dlil_ifproto_input(last_ifproto, pkt_first);
1076 			if_proto_free(last_ifproto);
1077 			last_ifproto = NULL;
1078 		}
1079 		if (ifproto != NULL) {
1080 			if_proto_free(ifproto);
1081 			ifproto = NULL;
1082 		}
1083 
1084 		m = next_packet;
1085 
1086 		/* update the driver's multicast filter, if needed */
1087 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
1088 			ifp->if_updatemcasts = 0;
1089 		}
1090 		if (iorefcnt == 1) {
1091 			/* If the next mbuf is on a different interface, unlock data-mov */
1092 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
1093 				ifnet_datamov_end(ifp);
1094 				iorefcnt = 0;
1095 			}
1096 		}
1097 	}
1098 
1099 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
1100 }
1101 
/*
 * Input thread for interfaces with legacy input model.
 *
 * This is only the bootstrap entry point: it validates the thread's
 * invariants, names the thread after its interface, marks itself
 * embryonic, and then blocks with dlil_input_thread_cont() as the
 * continuation; all actual packet processing happens there.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* this path is for dedicated (per-interface) threads only */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* an RXPOLL-capable legacy interface must use the rxpoll thread */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	/*
	 * Arm the wait channel before publishing EMBRYONIC, then issue a
	 * self-wakeup so the continuation runs once to clear the embryonic
	 * state; thread_block_parameter() never returns here.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1140 
/*
 * Continuation body for the legacy per-interface input thread.
 *
 * Re-entered via thread_block_parameter() each time the thread is woken
 * (see dlil_input_wakeup).  Drains the thread's packet queue under
 * `dlth_lock', hands the chain to dlil_input_packet_list_extended()
 * with the lock dropped, and loops until no new work was posted while
 * it was running.  Exits for good only on DLIL_INPUT_TERMINATE or an
 * interrupted wait.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		/* consume the pending-work indication for this pass */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after bootstrap: remember to drop the pending count */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/*
		 * Stop looping once nothing besides RUNNING/TERMINATE is
		 * set, i.e. no new wakeup (DLIL_INPUT_WAITING) arrived
		 * while we were processing with the lock dropped.
		 */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* go back to sleep; wake-ups resume at this continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1244 
1245 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)1246 dlil_input_wakeup(struct dlil_threading_info *inp)
1247 {
1248 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
1249 
1250 	inp->dlth_flags |= DLIL_INPUT_WAITING;
1251 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
1252 		inp->dlth_wtot++;
1253 		wakeup_one((caddr_t)&inp->dlth_flags);
1254 	}
1255 }
1256 
/*
 * Run an inbound packet through the interface filter chain of `ifp'.
 *
 * `m_p' and `frame_header_p' are in/out parameters: a filter's input
 * callback may modify or replace the mbuf and the frame header.
 * Returns 0 when the packet passed all filters; a non-zero result from
 * a filter aborts the traversal and is returned to the caller (callers
 * treat EJUSTRETURN as "the filter consumed the packet" and anything
 * else as a drop).
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, mbuf_ref_ref_t m_p,
    char **frame_header_p, protocol_family_t protocol_family,
    boolean_t skip_bridge)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/* fast path: nothing to do when no filters are attached */
	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}
		/* the bridge has already seen the packet */
		if (skip_bridge &&
		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
			continue;
		}
		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callback; the busy
			 * marker set above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
1322 
/*
 * Bootstrap entry point for the main DLIL input thread.
 *
 * Validates that this really is the main input thread (no associated
 * interface), marks itself embryonic, and blocks with
 * dlil_main_input_thread_cont() as the continuation, which performs
 * all of the actual work.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	dlil_threading_info_ref_t inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	/*
	 * Arm the wait channel, publish EMBRYONIC, and self-wake so the
	 * continuation runs once to clear the embryonic state.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1345 
/*
 * Main input thread:
 *
 *   a) handles all inbound packets for lo0
 *   b) handles all inbound packets for interfaces with no dedicated
 *	input thread (e.g. anything but Ethernet/PDP or those that support
 *	opportunistic polling.)
 *   c) protocol registrations
 *   d) packet injections
 *
 * This is the continuation body, re-entered on every wakeup via
 * thread_block_parameter().  Unlike the per-interface threads it can
 * never be terminated, so there is no terminate path here.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed as main-thread info (for lo0's queue) */
	dlil_main_threading_info_ref_t inpm = v;
	dlil_threading_info_ref_t inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		/* consume the pending-work indication for this pass */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/* snapshot pending protocol registration/injection requests */
		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* stop when no new work arrived while the lock was dropped */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* go back to sleep; the next wakeup re-enters this continuation */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1442 
/*
 * Input thread for interfaces with opportunistic polling input model.
 *
 * Bootstrap entry point only: validates that the interface is
 * RXPOLL-capable and legacy, names the thread, marks itself embryonic,
 * and blocks with dlil_rxpoll_input_thread_cont() as the continuation,
 * where the polling state machine lives.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_storage[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name;
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;

	/* only RXPOLL-capable legacy interfaces use this thread */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_storage, sizeof(thread_name_storage));
	thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	/*
	 * Arm the wait channel, publish EMBRYONIC, and self-wake so the
	 * continuation runs once to clear the embryonic state.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
1478 
/*
 * Continuation body for the opportunistic-polling input thread.
 *
 * In addition to draining the input queue like the legacy thread, this
 * loop samples inbound packet/byte/wakeup rates into EWMAs and, when
 * the averages cross the configured low/high watermarks, switches the
 * interface between IFNET_MODEL_INPUT_POLL_OFF and _POLL_ON via the
 * driver's if_input_ctl downcall.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	dlil_threading_info_ref_t inp = v;
	ifnet_ref_t ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		/* consume the pending-work indication for this pass */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after bootstrap: skip sampling/draining */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep accumulating until the hold time elapses */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* enforce a hold time between mode transitions */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Poll OFF when both packet and byte averages fall
			 * below the low watermarks; poll ON when the packet
			 * average and either the byte or wakeup average
			 * exceed the high watermarks.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				/* arm the poll cycle and kick off polling */
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* stop when no new work arrived while the lock was dropped */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* go back to sleep; wake-ups resume at this continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
1764 
1765 static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t * input_queue,dlil_freeq_t * freeq,struct ifnet_stat_increment_param * stat_delta)1766 dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
1767     dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
1768 {
1769 	uint32_t overcommitted_qlen;    /* Length in packets. */
1770 	uint64_t overcommitted_qsize;   /* Size in bytes. */
1771 	uint32_t target_qlen;           /* The desired queue length after trimming. */
1772 	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
1773 	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
1774 	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
1775 	struct mbuf *m = NULL, *m_tmp = NULL;
1776 
1777 	overcommitted_qlen = qlen(input_queue);
1778 	overcommitted_qsize = qsize(input_queue);
1779 	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;
1780 
1781 	if (overcommitted_qlen <= target_qlen) {
1782 		/*
1783 		 * The queue is already within the target limits.
1784 		 */
1785 		dropped_pkts = 0;
1786 		goto out;
1787 	}
1788 
1789 	pkts_to_drop = overcommitted_qlen - target_qlen;
1790 
1791 	/*
1792 	 * Proceed to removing packets from the head of the queue,
1793 	 * starting from the oldest, until the desired number of packets
1794 	 * has been dropped.
1795 	 */
1796 	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
1797 		if (pkts_to_drop <= dropped_pkts) {
1798 			break;
1799 		}
1800 		MBUFQ_REMOVE(&qmbufq(input_queue), m);
1801 		MBUFQ_NEXT(m) = NULL;
1802 		MBUFQ_ENQUEUE(freeq, m);
1803 
1804 		dropped_pkts += 1;
1805 		dropped_bytes += m_length(m);
1806 	}
1807 
1808 	/*
1809 	 * Adjust the length and the estimated size of the queue
1810 	 * after trimming.
1811 	 */
1812 	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
1813 	qlen(input_queue) = target_qlen;
1814 
1815 	/* qsize() is an approximation. */
1816 	if (dropped_bytes < qsize(input_queue)) {
1817 		qsize(input_queue) -= dropped_bytes;
1818 	} else {
1819 		qsize(input_queue) = 0;
1820 	}
1821 
1822 	/*
1823 	 * Adjust the ifnet statistics increments, if needed.
1824 	 */
1825 	stat_delta->dropped += dropped_pkts;
1826 	if (dropped_pkts < stat_delta->packets_in) {
1827 		stat_delta->packets_in -= dropped_pkts;
1828 	} else {
1829 		stat_delta->packets_in = 0;
1830 	}
1831 	if (dropped_bytes < stat_delta->bytes_in) {
1832 		stat_delta->bytes_in -= dropped_bytes;
1833 	} else {
1834 		stat_delta->bytes_in = 0;
1835 	}
1836 
1837 out:
1838 	if (dlil_verbose) {
1839 		/*
1840 		 * The basic information about the drop is logged
1841 		 * by the invoking function (dlil_input_{,a}sync).
1842 		 * If `dlil_verbose' flag is set, provide more information
1843 		 * that can be useful for debugging.
1844 		 */
1845 		DLIL_PRINTF("%s: "
1846 		    "qlen: %u -> %u, "
1847 		    "qsize: %llu -> %llu "
1848 		    "qlimit: %u (sysctl: %u) "
1849 		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
1850 		    "dropped_pkts: %u dropped_bytes %u\n",
1851 		    __func__,
1852 		    overcommitted_qlen, qlen(input_queue),
1853 		    overcommitted_qsize, qsize(input_queue),
1854 		    qlimit(input_queue), if_rcvq_burst_limit,
1855 		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
1856 		    dropped_pkts, dropped_bytes);
1857 	}
1858 
1859 	return dropped_pkts;
1860 }
1861 
1862 static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp,mbuf_t m,u_int32_t cnt)1863 handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
1864 {
1865 	lck_mtx_lock_spin(&ifp->if_flt_lock);
1866 	if_flt_monitor_busy(ifp);
1867 	lck_mtx_unlock(&ifp->if_flt_lock);
1868 
1869 	if (ifp->if_bridge != NULL) {
1870 		m = bridge_early_input(ifp, m, cnt);
1871 	}
1872 	lck_mtx_lock_spin(&ifp->if_flt_lock);
1873 	if_flt_monitor_unbusy(ifp);
1874 	lck_mtx_unlock(&ifp->if_flt_lock);
1875 	return m;
1876 }
1877