xref: /xnu-11215.41.3/bsd/net/dlil.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/dlil_sysctl.h>
54 #include <net/dlil_var_private.h>
55 #include <net/if_arp.h>
56 #include <net/iptap.h>
57 #include <net/pktap.h>
58 #include <net/droptap.h>
59 #include <net/nwk_wq.h>
60 #include <sys/kern_event.h>
61 #include <sys/kdebug.h>
62 #include <sys/mcache.h>
63 #include <sys/syslog.h>
64 #include <sys/protosw.h>
65 #include <sys/priv.h>
66 
67 #include <kern/assert.h>
68 #include <kern/task.h>
69 #include <kern/thread.h>
70 #include <kern/sched_prim.h>
71 #include <kern/locks.h>
72 #include <kern/zalloc.h>
73 
74 #include <net/kpi_protocol.h>
75 #include <net/if_types.h>
76 #include <net/if_ipsec.h>
77 #include <net/if_llreach.h>
78 #include <net/if_utun.h>
79 #include <net/kpi_interfacefilter.h>
80 #include <net/classq/classq.h>
81 #include <net/classq/classq_sfb.h>
82 #include <net/flowhash.h>
83 #include <net/ntstat.h>
84 #if SKYWALK
85 #include <skywalk/lib/net_filter_event.h>
86 #endif /* SKYWALK */
87 #include <net/net_api_stats.h>
88 #include <net/if_ports_used.h>
89 #include <net/if_vlan_var.h>
90 #include <netinet/in.h>
91 #if INET
92 #include <netinet/in_var.h>
93 #include <netinet/igmp_var.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/udp.h>
98 #include <netinet/udp_var.h>
99 #include <netinet/if_ether.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_tclass.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip_icmp.h>
104 #include <netinet/icmp_var.h>
105 #endif /* INET */
106 
107 #include <net/nat464_utils.h>
108 #include <netinet6/in6_var.h>
109 #include <netinet6/nd6.h>
110 #include <netinet6/mld6_var.h>
111 #include <netinet6/scope6_var.h>
112 #include <netinet/ip6.h>
113 #include <netinet/icmp6.h>
114 #include <net/pf_pbuf.h>
115 #include <libkern/OSAtomic.h>
116 #include <libkern/tree.h>
117 
118 #include <dev/random/randomdev.h>
119 #include <machine/machine_routines.h>
120 
121 #include <mach/thread_act.h>
122 #include <mach/sdt.h>
123 
124 #if CONFIG_MACF
125 #include <sys/kauth.h>
126 #include <security/mac_framework.h>
127 #include <net/ethernet.h>
128 #include <net/firewire.h>
129 #endif
130 
131 #if PF
132 #include <net/pfvar.h>
133 #endif /* PF */
134 #include <net/pktsched/pktsched.h>
135 #include <net/pktsched/pktsched_netem.h>
136 
137 #if NECP
138 #include <net/necp.h>
139 #endif /* NECP */
140 
141 #if SKYWALK
142 #include <skywalk/packet/packet_queue.h>
143 #include <skywalk/nexus/netif/nx_netif.h>
144 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
145 #endif /* SKYWALK */
146 
147 #include <net/sockaddr_utils.h>
148 
149 #include <os/log.h>
150 
151 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
152 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
153 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
154 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
155 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
156 
157 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
158 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
159 
160 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
161 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
162 
163 enum {
164 	kProtoKPI_v1    = 1,
165 	kProtoKPI_v2    = 2
166 };
167 
168 uint64_t if_creation_generation_count = 0;
169 
170 /*
171  * List of if_proto structures in if_proto_hash[] is protected by
172  * the ifnet lock.  The rest of the fields are initialized at protocol
173  * attach time and never change, thus no lock required as long as
174  * a reference to it is valid, via if_proto_ref().
175  */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;       /* if_proto_hash[] chain; protected by ifnet lock */
	u_int32_t                   refcount;        /* references; see if_proto_ref()/if_proto_free() */
	u_int32_t                   detached;        /* set once detached from the interface */
	struct ifnet                *ifp;            /* interface this protocol is attached to */
	protocol_family_t           protocol_family; /* attached protocol family */
	int                         proto_kpi;       /* kProtoKPI_v1 or kProtoKPI_v2 */
	/* Protocol callbacks; exactly one arm is valid, selected by proto_kpi. */
	union {
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
204 
205 SLIST_HEAD(proto_hash_entry, if_proto);
206 
207 #define DLIL_SDLDATALEN \
208 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
209 
210 /*
211  * In the common case, the LL address is stored in the
212  * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
213  * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
214  */
struct dl_if_lladdr_std {
	struct ifaddr   ifa;                            /* link-level ifaddr */
	u_int8_t        addr_sdl_bytes[DLIL_SDLMAXLEN]; /* sockaddr_dl storage: address */
	u_int8_t        mask_sdl_bytes[DLIL_SDLMAXLEN]; /* sockaddr_dl storage: netmask */
};
220 
221 /*
222  * However, in some rare cases we encounter LL addresses which
223  * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
224  * we allocate the storage in the permanent arena, using this memory layout.
225  */
struct dl_if_lladdr_xtra_space {
	struct ifaddr   ifa;                             /* link-level ifaddr */
	u_int8_t        addr_sdl_bytes[SOCK_MAXADDRLEN]; /* sockaddr_dl storage: address */
	u_int8_t        mask_sdl_bytes[SOCK_MAXADDRLEN]; /* sockaddr_dl storage: netmask */
};
231 
/*
 * DLIL-private representation of an interface.  The public ifnet is the
 * first member, so a pointer to one is a pointer to the other (see the
 * DLIL_TO_IFP / IFP_TO_DLIL macros below).
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct dl_if_lladdr_std dl_if_lladdr;   /* link-level address storage */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* nonzero once the above is valid */
	u_int8_t dl_if_unused;                  /* currently unused */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
255 
256 /* Values for dl_if_flags (private to DLIL) */
257 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
258 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
259 #define DLIF_DEBUG      0x4     /* has debugging info */
260 
261 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
262 
263 /* For gdb */
264 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
265 
/* Debug variant of dlil_ifnet, used when interface debugging (ifnet_debug)
 * is enabled; tracks reference hold/release call sites. */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
276 
277 #define DLIL_TO_IFP(s)  (&s->dl_if)
278 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
279 
/*
 * State for one attached interface filter.  The callback pointers are
 * established at attach time; list membership follows the interface's
 * filter list.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* filter list linkage */
	u_int32_t                       filt_skip;      /* nonzero: bypass during iteration -- NOTE(review): confirm semantics at use sites */
	u_int32_t                       filt_flags;     /* filter flags */
	ifnet_t                         filt_ifp;       /* interface being filtered */
	const char                      *filt_name;     /* client-supplied name */
	void                            *filt_cookie;   /* opaque client context */
	protocol_family_t               filt_protocol;  /* protocol of interest */
	iff_input_func                  filt_input;     /* inbound packet hook */
	iff_output_func                 filt_output;    /* outbound packet hook */
	iff_event_func                  filt_event;     /* interface event hook */
	iff_ioctl_func                  filt_ioctl;     /* ioctl hook */
	iff_detached_func               filt_detached;  /* detach notification */
};
294 
295 /* Mbuf queue used for freeing the excessive mbufs */
296 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
297 
298 struct proto_input_entry;
299 
300 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
301 
302 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
303 
304 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
305 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
306 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
307 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
308 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
309 
310 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
311 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
312     &dlil_lck_attributes);
313 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
314     &dlil_lck_attributes);
315 
316 #if DEBUG
317 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
318 #else
319 static unsigned int ifnet_debug;        /* debugging (disabled) */
320 #endif /* !DEBUG */
321 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
322 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
323 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
324 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
325 
326 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
327 
328 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
329 
330 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
331 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
332 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
333 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
334 
335 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
336 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
337 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
338 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
339 
340 static u_int32_t net_rtref;
341 
342 static struct dlil_main_threading_info dlil_main_input_thread_info;
343 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
344     (struct dlil_threading_info *)&dlil_main_input_thread_info;
345 
346 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
347 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
348 static void dlil_if_trace(struct dlil_ifnet *, int);
349 static void if_proto_ref(struct if_proto *);
350 static void if_proto_free(struct if_proto *);
351 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
352 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
353     u_int32_t list_count);
354 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
355 static void if_flt_monitor_busy(struct ifnet *);
356 static void if_flt_monitor_unbusy(struct ifnet *);
357 static void if_flt_monitor_enter(struct ifnet *);
358 static void if_flt_monitor_leave(struct ifnet *);
359 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
360     char **, protocol_family_t, boolean_t);
361 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
362     protocol_family_t);
363 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
364     const struct sockaddr_dl *);
365 static int ifnet_lookup(struct ifnet *);
366 static void if_purgeaddrs(struct ifnet *);
367 
368 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
369     struct mbuf *, char *);
370 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
371     struct mbuf *);
372 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
373     mbuf_t *, const struct sockaddr *, void *, char *, char *);
374 static void ifproto_media_event(struct ifnet *, protocol_family_t,
375     const struct kev_msg *);
376 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
377     unsigned long, void *);
378 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
379     struct sockaddr_dl *, size_t);
380 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
381     const struct sockaddr_dl *, const struct sockaddr *,
382     const struct sockaddr_dl *, const struct sockaddr *);
383 
384 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
385     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
386     boolean_t poll, struct thread *tp);
387 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
388     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
389 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
390 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
391     protocol_family_t *);
392 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
393     const struct ifnet_demux_desc *, u_int32_t);
394 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
395 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
396 #if !XNU_TARGET_OS_OSX
397 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
398     const struct sockaddr *, const char *, const char *,
399     u_int32_t *, u_int32_t *);
400 #else /* XNU_TARGET_OS_OSX */
401 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
402     const struct sockaddr *, const char *, const char *);
403 #endif /* XNU_TARGET_OS_OSX */
404 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
405     const struct sockaddr *, const char *, const char *,
406     u_int32_t *, u_int32_t *);
407 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
408 static void ifp_if_free(struct ifnet *);
409 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
410 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
411 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
412 
413 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
414     dlil_freeq_t *, struct ifnet_stat_increment_param *);
415 
416 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
417     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
418     boolean_t, struct thread *);
419 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
420     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
421     boolean_t, struct thread *);
422 
423 static void dlil_main_input_thread_func(void *, wait_result_t);
424 static void dlil_main_input_thread_cont(void *, wait_result_t);
425 
426 static void dlil_input_thread_func(void *, wait_result_t);
427 static void dlil_input_thread_cont(void *, wait_result_t);
428 
429 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
430 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
431 
432 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
433     thread_continue_t *);
434 static void dlil_terminate_input_thread(struct dlil_threading_info *);
435 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
436     struct dlil_threading_info *, struct ifnet *, boolean_t);
437 static boolean_t dlil_input_stats_sync(struct ifnet *,
438     struct dlil_threading_info *);
439 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
440     u_int32_t, ifnet_model_t, boolean_t);
441 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
442     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
443 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
444 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
445 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
446 #if DEBUG || DEVELOPMENT
447 static void dlil_verify_sum16(void);
448 #endif /* DEBUG || DEVELOPMENT */
449 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
450     protocol_family_t);
451 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
452     protocol_family_t);
453 
454 static void dlil_incr_pending_thread_count(void);
455 static void dlil_decr_pending_thread_count(void);
456 
457 static void ifnet_detacher_thread_func(void *, wait_result_t);
458 static void ifnet_detacher_thread_cont(void *, wait_result_t);
459 static void ifnet_detach_final(struct ifnet *);
460 static void ifnet_detaching_enqueue(struct ifnet *);
461 static struct ifnet *ifnet_detaching_dequeue(void);
462 
463 static void ifnet_start_thread_func(void *, wait_result_t);
464 static void ifnet_start_thread_cont(void *, wait_result_t);
465 
466 static void ifnet_poll_thread_func(void *, wait_result_t);
467 static void ifnet_poll_thread_cont(void *, wait_result_t);
468 
469 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
470     classq_pkt_t *, boolean_t, boolean_t *);
471 
472 static void ifp_src_route_copyout(struct ifnet *, struct route *);
473 static void ifp_src_route_copyin(struct ifnet *, struct route *);
474 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
475 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
476 
477 static errno_t if_mcasts_update_async(struct ifnet *);
478 
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484 
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486     &dlil_lck_attributes);
487 
488 static uint32_t ifnet_flowhash_seed;
489 
/* Key material consumed by ifnet_calc_flowhash() (salted with
 * ifnet_flowhash_seed). */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name */
	uint32_t        ifk_unit;               /* interface unit number */
	uint32_t        ifk_flags;              /* interface flags */
	uint32_t        ifk_eflags;             /* extended flags */
	uint32_t        ifk_capabilities;       /* capabilities */
	uint32_t        ifk_capenable;          /* enabled capabilities */
	uint32_t        ifk_output_sched_model; /* output scheduling model */
	uint32_t        ifk_rand1;              /* random salt */
	uint32_t        ifk_rand2;              /* random salt */
};
501 
502 /* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
	u_int32_t       ifce_flowhash;          /* interface flow hash (tree key) */
	struct ifnet    *ifce_ifp;              /* interface back-pointer */
};
508 
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511     const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515 
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522 
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525 
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527     u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529     u_int32_t flags);
530 
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532 
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540 
541 /* rate limit debug messages */
542 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
543 
/*
 * Atomically bump the global ifnet_delay_start_disabled counter
 * (declared elsewhere).  NOTE(review): presumably a nonzero count
 * disables the delayed if_start mechanism -- confirm at read sites.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
549 
550 static void log_hexdump(void *data, size_t len);
551 
552 unsigned int net_rxpoll = 1;
553 unsigned int net_affinity = 1;
554 unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */
555 
556 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
557 
558 extern u_int32_t        inject_buckets;
559 
560 /* DLIL data threshold thread call */
561 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
562 
563 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)564 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
565 {
566 	/*
567 	 * update filter count and route_generation ID to let TCP
568 	 * know it should reevalute doing TSO or not
569 	 */
570 	if (filter_enable) {
571 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
572 	} else {
573 		VERIFY(ifp->if_flt_no_tso_count != 0);
574 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
575 	}
576 	routegenid_update();
577 }
578 
579 #if SKYWALK
580 
581 static bool net_check_compatible_if_filter(struct ifnet *ifp);
582 
583 /* if_attach_nx flags defined in os_skywalk_private.h */
584 unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
585 unsigned int if_enable_fsw_ip_netagent =
586     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
587 unsigned int if_enable_fsw_transport_netagent =
588     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
589 
590 unsigned int if_netif_all =
591     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
592 
593 /* Configure flowswitch to use max mtu sized buffer */
594 static bool fsw_use_max_mtu_buffer = false;
595 
596 
597 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
598 
599 #include <skywalk/os_skywalk_private.h>
600 
601 boolean_t
ifnet_nx_noauto(ifnet_t ifp)602 ifnet_nx_noauto(ifnet_t ifp)
603 {
604 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
605 }
606 
/* Flowswitch auto-attach is skipped for low-latency interfaces. */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
612 
613 boolean_t
ifnet_is_low_latency(ifnet_t ifp)614 ifnet_is_low_latency(ifnet_t ifp)
615 {
616 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
617 }
618 
619 boolean_t
ifnet_needs_compat(ifnet_t ifp)620 ifnet_needs_compat(ifnet_t ifp)
621 {
622 	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
623 		return FALSE;
624 	}
625 #if !XNU_TARGET_OS_OSX
626 	/*
627 	 * To conserve memory, we plumb in the compat layer selectively; this
628 	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
629 	 * In particular, we check for Wi-Fi Access Point.
630 	 */
631 	if (IFNET_IS_WIFI(ifp)) {
632 		/* Wi-Fi Access Point */
633 		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
634 		    ifp->if_name[2] == '\0') {
635 			return if_netif_all;
636 		}
637 	}
638 #else /* XNU_TARGET_OS_OSX */
639 #pragma unused(ifp)
640 #endif /* XNU_TARGET_OS_OSX */
641 	return TRUE;
642 }
643 
644 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)645 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
646 {
647 	if (if_is_fsw_transport_netagent_enabled()) {
648 		/* check if netagent has been manually enabled for ipsec/utun */
649 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
650 			return ipsec_interface_needs_netagent(ifp);
651 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
652 			return utun_interface_needs_netagent(ifp);
653 		}
654 
655 		/* check ifnet no auto nexus override */
656 		if (ifnet_nx_noauto(ifp)) {
657 			return FALSE;
658 		}
659 
660 		/* check global if_attach_nx configuration */
661 		switch (ifp->if_family) {
662 		case IFNET_FAMILY_CELLULAR:
663 		case IFNET_FAMILY_ETHERNET:
664 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
665 				return TRUE;
666 			}
667 			break;
668 		default:
669 			break;
670 		}
671 	}
672 	return FALSE;
673 }
674 
675 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)676 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
677 {
678 #pragma unused(ifp)
679 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
680 		return TRUE;
681 	}
682 	return FALSE;
683 }
684 
685 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)686 ifnet_needs_netif_netagent(ifnet_t ifp)
687 {
688 #pragma unused(ifp)
689 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
690 }
691 
692 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)693 dlil_detach_nexus_instance(nexus_controller_t controller,
694     const char *func_str, uuid_t instance, uuid_t device)
695 {
696 	errno_t         err;
697 
698 	if (instance == NULL || uuid_is_null(instance)) {
699 		return FALSE;
700 	}
701 
702 	/* followed by the device port */
703 	if (device != NULL && !uuid_is_null(device)) {
704 		err = kern_nexus_ifdetach(controller, instance, device);
705 		if (err != 0) {
706 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
707 			    func_str, err);
708 		}
709 	}
710 	err = kern_nexus_controller_free_provider_instance(controller,
711 	    instance);
712 	if (err != 0) {
713 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
714 		    func_str, err);
715 	}
716 	return TRUE;
717 }
718 
719 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)720 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
721     uuid_t device)
722 {
723 	boolean_t               detached = FALSE;
724 	nexus_controller_t      controller = kern_nexus_shared_controller();
725 	int                     err;
726 
727 	if (dlil_detach_nexus_instance(controller, func_str, instance,
728 	    device)) {
729 		detached = TRUE;
730 	}
731 	if (provider != NULL && !uuid_is_null(provider)) {
732 		detached = TRUE;
733 		err = kern_nexus_controller_deregister_provider(controller,
734 		    provider);
735 		if (err != 0) {
736 			DLIL_PRINTF("%s deregister_provider %d\n",
737 			    func_str, err);
738 		}
739 	}
740 	return detached;
741 }
742 
743 static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,nexus_type_t type,ifnet_t ifp,uuid_t * provider,uuid_t * instance,nexus_attr_t attr)744 dlil_create_provider_and_instance(nexus_controller_t controller,
745     nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
746     nexus_attr_t attr)
747 {
748 	uuid_t          dom_prov;
749 	errno_t         err;
750 	nexus_name_t    provider_name;
751 	const char      *type_name =
752 	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
753 	struct kern_nexus_init init;
754 
755 	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
756 	if (err != 0) {
757 		DLIL_PRINTF("%s can't get %s provider, error %d\n",
758 		    __func__, type_name, err);
759 		goto failed;
760 	}
761 
762 	snprintf((char *)provider_name, sizeof(provider_name),
763 	    "com.apple.%s.%s", type_name, if_name(ifp));
764 	err = kern_nexus_controller_register_provider(controller,
765 	    dom_prov,
766 	    provider_name,
767 	    NULL,
768 	    0,
769 	    attr,
770 	    provider);
771 	if (err != 0) {
772 		DLIL_PRINTF("%s register %s provider failed, error %d\n",
773 		    __func__, type_name, err);
774 		goto failed;
775 	}
776 	bzero(&init, sizeof(init));
777 	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
778 	err = kern_nexus_controller_alloc_provider_instance(controller,
779 	    *provider,
780 	    NULL, NULL,
781 	    instance, &init);
782 	if (err != 0) {
783 		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
784 		    __func__, type_name, err);
785 		kern_nexus_controller_deregister_provider(controller,
786 		    *provider);
787 		goto failed;
788 	}
789 failed:
790 	return err;
791 }
792 
/*
 * dlil_attach_netif_nexus_common
 * - create a netif nexus provider/instance pair for `ifp' and attach the
 *   interface to it via kern_nexus_ifattach()
 * - returns TRUE on success; FALSE if the interface already has a nexus
 *   attached (IFCAP_SKYWALK set) or if any step of the attach fails
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	/*
	 * NOTE(review): `attr' is destroyed only on the failure path below;
	 * this success return does not call kern_nexus_attr_destroy().
	 * Verify whether the nexus layer consumes `attr', otherwise this
	 * looks like a leak -- TODO confirm.
	 */
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
846 
847 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)848 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
849 {
850 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
851 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
852 		goto failed;
853 	}
854 	switch (ifp->if_type) {
855 	case IFT_CELLULAR:
856 	case IFT_ETHER:
857 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
858 			/* don't auto-attach */
859 			goto failed;
860 		}
861 		break;
862 	default:
863 		/* don't auto-attach */
864 		goto failed;
865 	}
866 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
867 
868 failed:
869 	return FALSE;
870 }
871 
872 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)873 dlil_is_native_netif_nexus(ifnet_t ifp)
874 {
875 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
876 }
877 
/*
 * Tear down the netif nexus attachment described by `nexus_netif'
 * (provider, instance and ifattach handle) via dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
885 
886 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)887 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
888 {
889 	struct ifreq        ifr;
890 	int                 error;
891 
892 	bzero(&ifr, sizeof(ifr));
893 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
894 	if (error == 0) {
895 		*ifdm_p = ifr.ifr_devmtu;
896 	}
897 	return error;
898 }
899 
900 static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp,uint32_t * large_buf_size)901 _dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
902 {
903 	uint32_t tso_v4_mtu = 0;
904 	uint32_t tso_v6_mtu = 0;
905 
906 	if (!kernel_is_macos_or_server()) {
907 		return;
908 	}
909 
910 	if (!dlil_is_native_netif_nexus(ifp)) {
911 		return;
912 	}
913 	/*
914 	 * Note that we are reading the real hwassist flags set by the driver
915 	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
916 	 * hasn't been called yet.
917 	 */
918 	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
919 		tso_v4_mtu = ifp->if_tso_v4_mtu;
920 	}
921 	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
922 		tso_v6_mtu = ifp->if_tso_v6_mtu;
923 	}
924 	/*
925 	 * If the hardware supports TSO, adjust the large buf size to match the
926 	 * supported TSO MTU size.
927 	 */
928 	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
929 		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
930 	} else {
931 		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
932 	}
933 	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
934 }
935 
936 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)937 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
938     bool *use_multi_buflet, uint32_t *large_buf_size)
939 {
940 	struct kern_pbufpool_memory_info rx_pp_info;
941 	struct kern_pbufpool_memory_info tx_pp_info;
942 	uint32_t if_max_mtu = 0;
943 	uint32_t drv_buf_size;
944 	struct ifdevmtu ifdm;
945 	int err;
946 
947 	/*
948 	 * To perform intra-stack RX aggregation flowswitch needs to use
949 	 * multi-buflet packet.
950 	 */
951 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
952 
953 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
954 	/*
955 	 * IP over Thunderbolt interface can deliver the largest IP packet,
956 	 * but the driver advertises the MAX MTU as only 9K.
957 	 */
958 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
959 		if_max_mtu = IP_MAXPACKET;
960 		goto skip_mtu_ioctl;
961 	}
962 
963 	/* determine max mtu */
964 	bzero(&ifdm, sizeof(ifdm));
965 	err = dlil_siocgifdevmtu(ifp, &ifdm);
966 	if (__improbable(err != 0)) {
967 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
968 		    __func__, if_name(ifp));
969 		/* use default flowswitch buffer size */
970 		if_max_mtu = NX_FSW_BUFSIZE;
971 	} else {
972 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
973 		    ifdm.ifdm_max, ifdm.ifdm_current);
974 		/* rdar://problem/44589731 */
975 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
976 	}
977 
978 skip_mtu_ioctl:
979 	if (if_max_mtu == 0) {
980 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
981 		    __func__, if_name(ifp));
982 		return EINVAL;
983 	}
984 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
985 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
986 		    "max bufsize(%d)\n", __func__,
987 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
988 		return EINVAL;
989 	}
990 
991 	/*
992 	 * for skywalk native driver, consult the driver packet pool also.
993 	 */
994 	if (dlil_is_native_netif_nexus(ifp)) {
995 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
996 		    &tx_pp_info);
997 		if (err != 0) {
998 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
999 			    __func__, if_name(ifp));
1000 			return ENXIO;
1001 		}
1002 		drv_buf_size = tx_pp_info.kpm_bufsize *
1003 		    tx_pp_info.kpm_max_frags;
1004 		if (if_max_mtu > drv_buf_size) {
1005 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1006 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1007 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1008 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1009 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1010 			return EINVAL;
1011 		}
1012 	} else {
1013 		drv_buf_size = if_max_mtu;
1014 	}
1015 
1016 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1017 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1018 		*use_multi_buflet = true;
1019 		/* default flowswitch buffer size */
1020 		*buf_size = NX_FSW_BUFSIZE;
1021 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1022 	} else {
1023 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1024 	}
1025 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1026 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1027 	if (*buf_size >= *large_buf_size) {
1028 		*large_buf_size = 0;
1029 	}
1030 	return 0;
1031 }
1032 
/*
 * _dlil_attach_flowswitch_nexus
 * - create a flowswitch nexus provider/instance pair for `ifp' and attach
 *   it to the interface's netif nexus device port
 * - buffer sizes and multi-buflet mode come from
 *   _dlil_get_flowswitch_buffer_size()
 * - returns TRUE on success; on failure logs the reason and returns FALSE
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	/* policy: these interface classes never get an auto-attached fsw */
	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	/*
	 * NOTE(review): `attr' is destroyed only on the failure path below;
	 * this success return does not call kern_nexus_attr_destroy() --
	 * TODO confirm whether that is intentional.
	 */
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1131 
/*
 * dlil_attach_flowswitch_nexus
 * - attach a flowswitch nexus to `ifp' unless the interface uses the
 *   legacy TX model, already has a flowswitch instance, or policy says no
 * - on success the nexus bookkeeping is committed to the ifnet under the
 *   ifnet lock; if the interface is no longer fully attached by then, the
 *   freshly-created nexus is torn down again
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s\n", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model\n",
		    if_name(ifp));
		return FALSE;
	}

	/* uuid_is_null() == 0 means an instance UUID is already present */
	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1177 
/*
 * Tear down the flowswitch nexus attachment described by `nexus_fsw'
 * (provider, instance and device-attach handle) via dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1185 
1186 __attribute__((noinline))
1187 static void
dlil_netif_detach_notify(ifnet_t ifp)1188 dlil_netif_detach_notify(ifnet_t ifp)
1189 {
1190 	ifnet_detach_notify_cb_t notify = NULL;
1191 	void *arg = NULL;
1192 
1193 	ifnet_get_detach_notify(ifp, &notify, &arg);
1194 	if (notify == NULL) {
1195 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1196 		return;
1197 	}
1198 	(*notify)(arg);
1199 }
1200 
/*
 * dlil_quiesce_and_detach_nexuses
 * - suspend and drain data movement on `ifp', detach its flowswitch and
 *   netif nexuses (when attached) and clear the bookkeeping, then resume
 * - asserts that the provider/instance UUIDs are consistent with the
 *   device/attach UUID in each case
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* no packets may be moving through the interface during detach */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1232 
1233 boolean_t
ifnet_add_netagent(ifnet_t ifp)1234 ifnet_add_netagent(ifnet_t ifp)
1235 {
1236 	int     error;
1237 
1238 	error = kern_nexus_interface_add_netagent(ifp);
1239 	os_log(OS_LOG_DEFAULT,
1240 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1241 	    ifp->if_xname, error);
1242 	return error == 0;
1243 }
1244 
1245 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1246 ifnet_remove_netagent(ifnet_t ifp)
1247 {
1248 	int     error;
1249 
1250 	error = kern_nexus_interface_remove_netagent(ifp);
1251 	os_log(OS_LOG_DEFAULT,
1252 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1253 	    ifp->if_xname, error);
1254 	return error == 0;
1255 }
1256 
1257 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1258 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1259 {
1260 	if (!IF_FULLY_ATTACHED(ifp)) {
1261 		return FALSE;
1262 	}
1263 	return dlil_attach_flowswitch_nexus(ifp);
1264 }
1265 
/*
 * ifnet_detach_flowswitch_nexus
 * - detach the flowswitch nexus from `ifp'
 * - the nexus bookkeeping is snapshotted and cleared under the ifnet
 *   lock, then the actual detach happens after the lock is dropped
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch     nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1278 
1279 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1280 ifnet_attach_native_flowswitch(ifnet_t ifp)
1281 {
1282 	if (!dlil_is_native_netif_nexus(ifp)) {
1283 		/* not a native netif */
1284 		return;
1285 	}
1286 	ifnet_attach_flowswitch_nexus(ifp);
1287 }
1288 
/*
 * ifnet_set_flowswitch_rx_callback
 * - install (or clear) the flowswitch RX callback and its argument
 * - sleeps until no thread holds a reference on the current callback
 *   (see ifnet_get/release_flowswitch_rx_callback) before swapping
 * - always returns 0
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1304 
/*
 * ifnet_get_flowswitch_rx_callback
 * - return the flowswitch RX callback and its argument with a reference
 *   held; caller must drop it via ifnet_release_flowswitch_rx_callback()
 * - returns ENOENT when no callback is installed
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1326 
/*
 * Drop a reference obtained via ifnet_get_flowswitch_rx_callback();
 * wakes any setter waiting for the reference count to reach zero.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1336 
/*
 * ifnet_set_delegate_parent
 * - set (or clear) the delegate parent of `difp'
 * - sleeps until no thread holds a reference on the current parent
 *   (see ifnet_get/release_delegate_parent) before swapping
 * - always returns 0
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1351 
/*
 * ifnet_get_delegate_parent
 * - return the delegate parent of `difp' with a reference held; caller
 *   must drop it via ifnet_release_delegate_parent()
 * - returns ENOENT when no delegate parent is set
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1365 
/*
 * Drop a reference obtained via ifnet_get_delegate_parent(); wakes any
 * setter waiting for the reference count to reach zero.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1375 
/*
 * Set the detach-notify callback/argument; caller must hold the ifnet
 * lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1384 
/*
 * Read the detach-notify callback/argument; caller must hold the ifnet
 * lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1393 
/*
 * Locking wrapper around ifnet_set_detach_notify_locked().
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1402 
/*
 * Locking wrapper around ifnet_get_detach_notify_locked().
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1411 #endif /* SKYWALK */
1412 
/*
 * Sanity-check an inbound mbuf: it must carry a packet header and its
 * receive interface must match `ifp' (the check is relaxed when ifp is
 * the loopback interface); panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}
1421 
/*
 * Integer exponentially-weighted moving average:
 * old <- old + (new - old) / 2^decay; seeds directly with `new' when
 * old is 0.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1430 
#define MBPS    (1ULL * 1000 * 1000)    /* 10^6 */
#define GBPS    (MBPS * 1000)           /* 10^9 */

/*
 * Rx-poll watermark tuning, keyed by downlink speed.
 */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* NOTE(review): the all-zero entry appears to terminate the table --
 * confirm against the lookup code */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1450 
/* serializes updates to (and sleeps on) dlil_pending_thread_cnt below */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* count of pending DLIL threads; waiters are woken when it reaches 0 */
static uint32_t dlil_pending_thread_cnt = 0;
1454 
/*
 * Account one more pending DLIL thread; paired with
 * dlil_decr_pending_thread_count().
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1463 
/*
 * Retire one pending DLIL thread; wakes threads sleeping on
 * dlil_pending_thread_cnt when the count drops to zero.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1476 
1477 int
proto_hash_value(u_int32_t protocol_family)1478 proto_hash_value(u_int32_t protocol_family)
1479 {
1480 	/*
1481 	 * dlil_proto_unplumb_all() depends on the mapping between
1482 	 * the hash bucket index and the protocol family defined
1483 	 * here; future changes must be applied there as well.
1484 	 */
1485 	switch (protocol_family) {
1486 	case PF_INET:
1487 		return 0;
1488 	case PF_INET6:
1489 		return 1;
1490 	case PF_VLAN:
1491 		return 2;
1492 	case PF_UNSPEC:
1493 	default:
1494 		return 3;
1495 	}
1496 }
1497 
1498 /*
1499  * Caller must already be holding ifnet lock.
1500  */
1501 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1502 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1503 {
1504 	struct if_proto *proto = NULL;
1505 	u_int32_t i = proto_hash_value(protocol_family);
1506 
1507 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1508 
1509 	if (ifp->if_proto_hash != NULL) {
1510 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1511 	}
1512 
1513 	while (proto != NULL && proto->protocol_family != protocol_family) {
1514 		proto = SLIST_NEXT(proto, next_hash);
1515 	}
1516 
1517 	if (proto != NULL) {
1518 		if_proto_ref(proto);
1519 	}
1520 
1521 	return proto;
1522 }
1523 
/*
 * Take a reference on an if_proto; dropped via if_proto_free().
 */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1529 
1530 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1531 
/*
 * if_proto_free
 * - drop a reference on `proto'; when the last reference is released,
 *   invoke the protocol's detached callback, purge routes for the
 *   interface/protocol pair, post KEV_DL_PROTO_DETACHED and free the
 *   structure
 * - when the last protocol detaches from the interface, the interface
 *   is also marked down (see rdar://problem/30810208 below)
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	/* only the thread that drops the last reference tears down */
	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1593 
1594 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1595 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1596 {
1597 #if !MACH_ASSERT
1598 #pragma unused(ifp)
1599 #endif
1600 	unsigned int type = 0;
1601 	int ass = 1;
1602 
1603 	switch (what) {
1604 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1605 		type = LCK_RW_ASSERT_EXCLUSIVE;
1606 		break;
1607 
1608 	case IFNET_LCK_ASSERT_SHARED:
1609 		type = LCK_RW_ASSERT_SHARED;
1610 		break;
1611 
1612 	case IFNET_LCK_ASSERT_OWNED:
1613 		type = LCK_RW_ASSERT_HELD;
1614 		break;
1615 
1616 	case IFNET_LCK_ASSERT_NOTOWNED:
1617 		/* nothing to do here for RW lock; bypass assert */
1618 		ass = 0;
1619 		break;
1620 
1621 	default:
1622 		panic("bad ifnet assert type: %d", what);
1623 		/* NOTREACHED */
1624 	}
1625 	if (ass) {
1626 		LCK_RW_ASSERT(&ifp->if_lock, type);
1627 	}
1628 }
1629 
/* Acquire the per-ifnet RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the per-ifnet RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-ifnet RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1647 
1648 #if INET
/* Acquire the per-ifnet IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-ifnet IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-ifnet IPv4 data RW lock (shared or exclusive). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
1666 #endif
1667 
/* Acquire the per-ifnet IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-ifnet IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-ifnet IPv6 data RW lock (shared or exclusive). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1685 
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock (shared or exclusive). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1709 
1710 /*
1711  * dlil_ifp_protolist
1712  * - get the list of protocols attached to the interface, or just the number
1713  *   of attached protocols
1714  * - if the number returned is greater than 'list_count', truncation occurred
1715  *
1716  * Note:
1717  * - caller must already be holding ifnet lock.
1718  */
1719 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1720 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1721     u_int32_t list_count)
1722 {
1723 	u_int32_t       count = 0;
1724 	int             i;
1725 
1726 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1727 
1728 	if (ifp->if_proto_hash == NULL) {
1729 		goto done;
1730 	}
1731 
1732 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1733 		struct if_proto *proto;
1734 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1735 			if (list != NULL && count < list_count) {
1736 				list[count] = proto->protocol_family;
1737 			}
1738 			count++;
1739 		}
1740 	}
1741 done:
1742 	return count;
1743 }
1744 
/*
 * Copy up to `count' attached protocol families into `protolist' and
 * return the total number attached (may exceed `count' on truncation);
 * takes the ifnet lock shared around the walk.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1753 
/*
 * Free a protocol-family list via kfree_data_addr(); presumably the list
 * was obtained with a kalloc_data-style allocation -- see callers.
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1759 
/*
 * dlil_post_msg
 * - post a KEV_NETWORK_CLASS kernel event for `ifp' with the given
 *   subclass/code
 * - `event_data' may be NULL, in which case a plain net_event_data
 *   carrying just the interface identity is sent
 * - the interface generation count is updated except for frequent
 *   link-quality/RRC-state/primary-elected events, or when
 *   `suppress_generation' is set
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the event with the interface identity */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
1821 
/*
 * Allocate the per-interface protocol statistics blocks (TCP/UDP local
 * stats and IPv4/IPv6 ECN stats) hanging off `ifp'.  Returns 0 on
 * success, EINVAL otherwise.
 *
 * The tcpstat/udpstat zone buffers carry extra slop: the stats object
 * is placed at the first 64-bit aligned address that leaves room for a
 * leading pointer slot, and the original zone pointer is stashed in
 * that slot so it can be recovered when freeing.
 *
 * NOTE(review): if if_tcp_stat or if_udp_stat is already non-NULL on
 * entry, `ret' remains EINVAL and the cleanup path below frees the
 * existing allocations before returning the error — callers appear to
 * invoke this only once per ifnet; confirm before reusing elsewhere.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the stashed zone pointer just below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			/* recover the stashed zone pointer just below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
1907 
1908 static void
dlil_reset_rxpoll_params(ifnet_t ifp)1909 dlil_reset_rxpoll_params(ifnet_t ifp)
1910 {
1911 	ASSERT(ifp != NULL);
1912 	ifnet_set_poll_cycle(ifp, NULL);
1913 	ifp->if_poll_update = 0;
1914 	ifp->if_poll_flags = 0;
1915 	ifp->if_poll_req = 0;
1916 	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
1917 	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
1918 	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
1919 	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
1920 	net_timerclear(&ifp->if_poll_mode_holdtime);
1921 	net_timerclear(&ifp->if_poll_mode_lasttime);
1922 	net_timerclear(&ifp->if_poll_sample_holdtime);
1923 	net_timerclear(&ifp->if_poll_sample_lasttime);
1924 	net_timerclear(&ifp->if_poll_dbg_lasttime);
1925 }
1926 
/*
 * Set up (and usually start) the input thread described by `inp' for
 * interface `ifp'.  A NULL `ifp' denotes the main DLIL input thread,
 * created once at dlil_init() time.  The continuation function chosen
 * for the thread is reported through `thfunc' when non-NULL.
 *
 * Returns 0 on success; ENODEV when the synchronous (netif) strategy
 * is selected and no dedicated thread is needed.  Panics if the kernel
 * thread itself cannot be created.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling applies only to legacy (non-netif) RXPOLL devices */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no dedicated thread to start */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2068 
/*
 * Scrub a retiring dlil_threading_info so the structure can be reused:
 * tear down its mutex and lock group and reset all bookkeeping state.
 * The input thread must already be gone; the VERIFYs assert that no
 * packets, affinity, or driver/poller threads remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2094 
/*
 * Final act of a per-interface input thread; runs on the dying thread
 * itself (never on the main input thread).  Drains queued packets,
 * flags termination complete and wakes the waiter, drops the extra
 * reference taken by kernel_thread_start(), and terminates the
 * current thread — this function does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach any pending packets before announcing completion */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2142 
2143 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2144 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2145 {
2146 	thread_affinity_policy_data_t policy;
2147 
2148 	bzero(&policy, sizeof(policy));
2149 	policy.affinity_tag = tag;
2150 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2151 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2152 }
2153 
2154 #if SKYWALK
2155 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2156 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2157     enum net_filter_event_subsystems state)
2158 {
2159 	evhlog(debug, "%s: eventhandler saw event type=net_filter_event_state event_code=0x%d",
2160 	    __func__, state);
2161 
2162 	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2163 	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2164 		if_enable_fsw_transport_netagent = 1;
2165 	} else {
2166 		if_enable_fsw_transport_netagent = 0;
2167 	}
2168 	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2169 		kern_nexus_update_netagents();
2170 	} else if (!if_enable_fsw_transport_netagent) {
2171 		necp_update_all_clients();
2172 	}
2173 }
2174 #endif /* SKYWALK */
2175 
/*
 * One-time initialization of the DLIL (data link interface layer):
 * compile-time layout/flag equivalence assertions, boot-arg overrides,
 * zone creation for dlil_ifnet and per-interface TCP/UDP stats,
 * dependent subsystem init (ifa, pf, classq, pktsched, flowadv,
 * pktap, droptap, QoS map, low-power and ports-used handlers), and
 * finally creation of the main input thread and the ifnet detacher
 * thread.  Blocks until those threads have been scheduled once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* boot-arg overrides for tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
	if (kernel_is_macos_or_server() && if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize droptap interface */
	droptap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2491 
/*
 * Take one busy reference on the interface filter monitor.
 * Caller must hold if_flt_lock; VERIFY guards counter overflow.
 */
static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	VERIFY(ifp->if_flt_busy != 0);
}
2500 
/*
 * Drop a busy reference on the filter monitor; thin wrapper around
 * if_flt_monitor_leave().
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2506 
/*
 * Enter the interface filter monitor: sleep until no other thread
 * holds it busy, then take a busy reference.  Called and returns with
 * if_flt_lock held (msleep drops and reacquires it while sleeping).
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2519 
/*
 * Release one busy reference on the filter monitor and, once the
 * count reaches zero, wake all threads waiting in
 * if_flt_monitor_enter().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2533 
/*
 * Attach the interface filter described by `if_filter' to `ifp',
 * returning the allocated filter through `filter_ref'.  Returns 0 on
 * success or ENXIO when the interface is not (or no longer) attached.
 *
 * Input/output/event/ioctl callbacks are deliberately not installed
 * for internal coprocessor and management interfaces; the detach
 * callback is always kept so the owner is still notified at teardown.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the io refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2626 
/*
 * Detach an interface filter and free it.
 *
 * detached == 0 (normal path): walk every attached interface looking for
 * `filter` on its filter list, unlink it, and adjust per-ifnet and global
 * filter accounting.  Returns EINVAL if the reference is not found on any
 * interface.
 *
 * detached != 0 (implicit path): called from ifnet_detach_final() after the
 * caller has already emptied if_flt_head; only the counters are adjusted.
 *
 * In both cases the filter's filt_detached callback (if any) is invoked
 * before the filter memory is returned to dlif_filt_zone.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				/* skip entries already being torn down */
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				/*
				 * When we remove the bridge's interface filter,
				 * clear the field in the ifnet.
				 */
				if ((filter->filt_flags & DLIL_IFF_BRIDGE)
				    != 0) {
					ifp->if_bridge = NULL;
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only reachable with filter != NULL on the EINVAL (not-found) path */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2757 
2758 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2759 dlil_detach_filter(interface_filter_t filter)
2760 {
2761 	if (filter == NULL) {
2762 		return;
2763 	}
2764 	dlil_detach_filter_internal(filter, 0);
2765 }
2766 
2767 __private_extern__ boolean_t
dlil_has_ip_filter(void)2768 dlil_has_ip_filter(void)
2769 {
2770 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
2771 
2772 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
2773 
2774 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2775 	return has_filter;
2776 }
2777 
2778 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2779 dlil_has_if_filter(struct ifnet *ifp)
2780 {
2781 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2782 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2783 	return has_filter;
2784 }
2785 
/*
 * Signal an input thread that work is pending; caller must hold dlth_lock.
 *
 * DLIL_INPUT_WAITING is always latched so the thread sees the pending work
 * when it re-checks dlth_flags; the actual wakeup is issued only when the
 * thread is not already running.  dlth_wtot counts issued wakeups and feeds
 * the rxpoll wakeup-rate EWMA.
 */
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
2797 
/*
 * One-time entry point for the main DLIL input thread.  Marks the thread
 * embryonic, issues a self-wakeup so the continuation runs once to leave
 * the embryonic state, then blocks with dlil_main_input_thread_cont as the
 * continuation; all subsequent work happens in the continuation.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before dropping the lock so the wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2820 
2821 /*
2822  * Main input thread:
2823  *
2824  *   a) handles all inbound packets for lo0
2825  *   b) handles all inbound packets for interfaces with no dedicated
2826  *	input thread (e.g. anything but Ethernet/PDP or those that support
2827  *	opportunistic polling.)
2828  *   c) protocol registrations
2829  *   d) packet injections
2830  */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed as the main-thread info (for lo_rcvq_pkts) */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* drain work until no flags other than RUNNING remain set */
	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* process the dequeued chains without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no more work: re-arm the wait and block on this same continuation */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
2917 
2918 /*
2919  * Input thread for interfaces with legacy input model.
2920  */
/*
 * One-time entry point for a dedicated (legacy-model) per-interface input
 * thread.  Names the thread after the interface, marks it embryonic, wakes
 * it once to leave the embryonic state, then blocks with
 * dlil_input_thread_cont as the continuation.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy interfaces here must not be in rxpoll mode */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before dropping the lock so the wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2955 
/*
 * Continuation for dedicated (legacy-model) per-interface input threads:
 * dequeues pending packet chains, syncs interface stats, and hands the
 * chain to dlil_input_packet_list_extended().  Honors termination requests
 * (DLIL_INPUT_TERMINATE or an interrupted wait) by self-destructing via
 * dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* drain work until only RUNNING/TERMINATE remain in dlth_flags */
	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* process the dequeued chain without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait and block on this same continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3059 
3060 /*
3061  * Input thread for interfaces with opportunistic polling input model.
3062  */
/*
 * One-time entry point for an opportunistic-polling (rxpoll) input thread.
 * Names the thread after the interface, marks it embryonic, wakes it once
 * to leave the embryonic state, then blocks with
 * dlil_rxpoll_input_thread_cont as the continuation.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* only legacy interfaces that opted into rxpoll get this thread */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before dropping the lock so the wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3094 
/*
 * Continuation for opportunistic-polling (rxpoll) input threads.
 *
 * Each pass: dequeue pending packets, accumulate sampling statistics and
 * EWMAs of inbound packets/bytes/wakeups, and decide whether to transition
 * the interface between IFNET_MODEL_INPUT_POLL_ON and _OFF based on the
 * low/high watermarks.  A mode change is pushed to the driver via its
 * if_input_ctl downcall while holding an IO refcnt on the interface.
 * Honors termination requests like the legacy continuation.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: skip sampling, just sync stats */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep accumulating until the sampling hold time elapses */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* rate-limit transitions by the mode hold time */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* below both low watermarks: turn polling off */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				/* above high watermarks: turn polling on */
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait and block on this same continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3380 
3381 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3382 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3383 {
3384 	if (p != NULL) {
3385 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3386 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3387 			return EINVAL;
3388 		}
3389 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3390 		    p->packets_lowat >= p->packets_hiwat) {
3391 			return EINVAL;
3392 		}
3393 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3394 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3395 			return EINVAL;
3396 		}
3397 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3398 		    p->bytes_lowat >= p->bytes_hiwat) {
3399 			return EINVAL;
3400 		}
3401 		if (p->interval_time != 0 &&
3402 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3403 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3404 		}
3405 	}
3406 	return 0;
3407 }
3408 
/*
 * Recompute the interface's rxpoll tunables from the current input link
 * rate and (optionally) caller-supplied parameters.
 *
 * With no link rate and no parameters, polling is effectively disabled:
 * low watermarks go to 0, high watermarks to UINT32_MAX, and the interval
 * to the minimum.  Otherwise the rxpoll_tbl row for the link speed
 * provides defaults for any parameter the caller left at zero.
 *
 * NOTE(review): per the ternaries below, a nonzero if_rxpoll_max sysctl
 * overrides the caller's packets_limit, and interval_time is honored only
 * while if_rxpoll_interval_time is at its default (IF_RXPOLL_INTERVALTIME)
 * — confirm this sysctl-wins precedence is intentional.
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table row whose speed does not exceed inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert the nanosecond hold times into timespecs on the ifnet */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3478 
3479 /*
3480  * Must be called on an attached ifnet (caller is expected to check.)
3481  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3482  */
/*
 * Validate and apply rxpoll parameters for an attached interface.
 * Returns ENXIO when the interface does not support polling or has no
 * input thread, or the validation error from
 * dlil_rxpoll_validate_params().  `locked` indicates the caller already
 * holds the input thread's dlth_lock.
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3520 
3521 /*
3522  * Must be called on an attached ifnet (caller is expected to check.)
3523  */
3524 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3525 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3526 {
3527 	struct dlil_threading_info *inp;
3528 
3529 	VERIFY(ifp != NULL && p != NULL);
3530 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3531 		return ENXIO;
3532 	}
3533 
3534 	bzero(p, sizeof(*p));
3535 
3536 	lck_mtx_lock(&inp->dlth_lock);
3537 	p->packets_limit = ifp->if_rxpoll_plim;
3538 	p->packets_lowat = ifp->if_rxpoll_plowat;
3539 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3540 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3541 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3542 	p->interval_time = ifp->if_rxpoll_ival;
3543 	lck_mtx_unlock(&inp->dlth_lock);
3544 
3545 	return 0;
3546 }
3547 
/*
 * Simple (non-extended, non-poll) packet input KPI: hands the chain to
 * ifnet_input_common() with no tail pointer.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3554 
/*
 * Extended packet input KPI: caller supplies the chain tail and stat
 * increments, letting ifnet_input_common() skip the chain walk.
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3561 
3562 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3563 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3564     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3565 {
3566 	return ifnet_input_common(ifp, m_head, m_tail, s,
3567 	           (m_head != NULL), TRUE);
3568 }
3569 
/*
 * Common implementation behind ifnet_input{,_extended,_poll}().
 *
 * Validates the caller's parameters, counts/sizes the mbuf chain (or
 * trusts the caller's stats for the extended variant), takes an IO
 * refcnt on the interface for the duration of input processing, and
 * dispatches the chain to the interface's DLIL input function.
 *
 * ifp    : receiving interface; chain is freed if NULL/detached.
 * m_head : head of the packet chain (may be NULL only when polling).
 * m_tail : tail of the chain; NULL forces a full walk to find it.
 * s      : caller-provided stat increments (required iff ext).
 * ext    : extended variant — stats came straight from the driver.
 * poll   : chain was obtained via the RX poll path.
 *
 * Returns 0 on success, EINVAL on bad parameters or detached ifp.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/*
	 * A NULL chain is only legal for the polling variant, and the
	 * extended variant requires caller-supplied statistics.
	 */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/*
		 * No tail given: walk the chain via `last' to find it,
		 * counting packets and bytes along the way.  (The loop
		 * condition only guards the m_head == NULL poll case;
		 * m_head itself is never advanced.)
		 */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		/* Tail given: re-derive counts only when sanity checking. */
		if (__improbable(dlil_input_sanity_check != 0)) {
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		/* Trust the driver-supplied counts and tail. */
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): when the caller supplied stats (s != NULL), the
	 * original `s' — not the locally recomputed `_s' — is what gets
	 * passed to input_func below; `_s' is only consumed by the
	 * disabled-input accounting path.  Confirm this asymmetry is
	 * intentional before changing it.
	 */
	if (ifp->if_xflags & IFXF_DISABLE_INPUT) {
		/* Input disabled: drop the chain but still count it. */
		m_freem_list(m_head);

		os_atomic_add(&ifp->if_data.ifi_ipackets, _s.packets_in, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, _s.bytes_in, relaxed);

		goto done;
	}

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

done:
	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3694 
3695 #if SKYWALK
/*
 * Atomically install `fn' as the interface's DLIL input handler.
 * Succeeds only if the current handler is still the default
 * dlil_input_handler; returns EBUSY if another handler is already
 * installed.  ptrauth_nop_cast strips/retains pointer authentication
 * so the raw function pointers can be compared and swapped.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3703 
/*
 * Restore the default DLIL input handler.  The compare-and-swap
 * re-reads the current handler on each iteration and retries until
 * the swap to dlil_input_handler succeeds, so concurrent updates
 * cannot be lost.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
3713 
/*
 * Atomically install `fn' as the interface's DLIL output handler.
 * Succeeds only if the current handler is still the default
 * dlil_output_handler; returns EBUSY otherwise.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3721 
/*
 * Restore the default DLIL output handler, retrying the
 * compare-and-swap (with a freshly re-read current value) until it
 * succeeds.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
3731 #endif /* SKYWALK */
3732 
/*
 * Default DLIL output handler: forward the packet straight to the
 * driver's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
3738 
/*
 * Default DLIL input handler: route the chain to the interface's
 * input thread strategy (async or sync).  Falls back to the main
 * input thread when the interface has no dedicated one.  On
 * DEVELOPMENT/DEBUG kernels, a thread marked NET_THREAD_SYNC_RX is
 * forced through the synchronous path.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3759 
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True only for mbuf-backed queues (qtype == QP_MBUF) whose current
 * length exceeds BOTH the queue's own limit and the global
 * if_rcvq_burst_limit sysctl.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)

/* NOTE(review): not referenced in this chunk — confirm consumers elsewhere. */
#define MAX_KNOWN_MBUF_CLASS 8
3768 
/*
 * Trim an overcommitted receive queue down to if_rcvq_trim_pct percent
 * of its configured limit.
 *
 * Packets are removed from the head of the queue (oldest first) and
 * moved onto `freeq'; the caller must free them only AFTER releasing
 * the lock that protects `input_queue' (hence the _locked suffix —
 * that lock must be held on entry).  `stat_delta' is adjusted so the
 * dropped packets/bytes are reported as drops rather than input.
 *
 * Returns the number of packets dropped (0 if already within target).
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 * Underflow is clamped to zero since the counts/sizes are
	 * approximations.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
3865 
/*
 * Asynchronous input strategy: enqueue the chain onto the input
 * thread's receive queue and wake that thread; protocol processing
 * happens later in the input thread's context.
 *
 * A locally adjusted copy of `s' (`s_adj') is used for statistics so
 * that packets trimmed from an overcommitted queue are not counted as
 * received.  Excess packets are collected on `freeq' and freed only
 * after dlth_lock is dropped.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* Drop the spin lock across the affinity set / reference. */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* Trim bursts that exceed the receive queue's limits. */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4009 
/*
 * Synchronous input strategy: enqueue the chain, then immediately
 * dequeue everything and run protocol processing in the calling
 * thread's context (no handoff to the input thread).  Never used with
 * the main input thread.  Excess packets trimmed from an
 * overcommitted queue are freed only after dlth_lock is dropped.
 * Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* Trim bursts that exceed the receive queue's limits. */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* Verify that `s' accurately describes the chain `m_head'. */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* Drain the queue entirely; we process everything right here. */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4119 
4120 #if SKYWALK
/*
 * Atomically install `fn' as the driver-facing if_output routine.
 * Succeeds only if the current if_output is still the saved original
 * (if_save_output); returns EBUSY otherwise.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4128 
/*
 * Restore the saved original if_output routine, retrying the
 * compare-and-swap (re-reading the current value each iteration)
 * until it succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4138 
/*
 * Atomically install `fn' as the interface's if_start routine.
 * Succeeds only if the current if_start is still the saved original
 * (if_save_start); returns EBUSY otherwise.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4146 
/*
 * Restore the saved original if_start routine, retrying the
 * compare-and-swap until it succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4156 #endif /* SKYWALK */
4157 
/*
 * Common body for ifnet_start()/ifnet_start_ignore_delay(): signal
 * the interface's starter thread to drain the send queue.
 *
 * resetfc:      clear IFSF_FLOW_CONTROLLED before signalling (used
 *               when flow control is being lifted); otherwise a
 *               flow-controlled interface is left alone.
 * ignore_delay: set IFSF_NO_DELAY so the starter skips the delayed
 *               start heuristics this round.
 *
 * No-op for interfaces without a starter thread (!IFEF_TXSTART).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter only when it is idle and either the delayed
	 * start machinery does not apply or its threshold was reached.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4190 
/*
 * Public entry point: kick the starter thread for `ifp', honoring
 * flow control and the delayed-start heuristics.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4196 
/*
 * Like ifnet_start(), but bypasses the delayed-start heuristics
 * (sets IFSF_NO_DELAY for this round).
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4202 
/*
 * Entry point of an interface's dedicated starter thread.  Performs
 * one-time setup (thread name, optional affinity for lo0), enters the
 * embryonic state, and then blocks into ifnet_start_thread_cont(),
 * which runs the service loop for the rest of the thread's life.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4268 
/*
 * Continuation of the starter thread: services start requests by
 * repeatedly invoking the driver's if_start routine until no new
 * requests arrive (or the interface is flow controlled/terminating),
 * then re-arms a (possibly deadline-bounded) wait and blocks back
 * into itself.  Terminates the thread when the interface is detaching
 * or the wait was interrupted.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* First wakeup: leave embryonic state and re-arm the wait. */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed start: defer draining a short queue on
		 * IFEF_ENQUEUE_MULTI interfaces so more packets can
		 * accumulate, unless IFSF_NO_DELAY was requested.
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts = NULL;

		/*
		 * NOTE(review): `ts' was just set to NULL above, so this
		 * check is always true — likely a leftover from an
		 * earlier refactor; confirm before simplifying.
		 */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* A zero timespec means "no deadline". */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4414 
4415 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4416 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4417 {
4418 	if (ts == NULL) {
4419 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4420 	} else {
4421 		*(&ifp->if_start_cycle) = *ts;
4422 	}
4423 
4424 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4425 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4426 		    if_name(ifp), ts->tv_nsec);
4427 	}
4428 }
4429 
/*
 * Record a poll request and wake the interface's poller thread if it
 * is idle.  Caller must hold if_poll_lock.
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

	ifp->if_poll_req++;
	/* Only wake the poller when it exists and is not already running. */
	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
	    ifp->if_poll_thread != THREAD_NULL) {
		wakeup_one((caddr_t)&ifp->if_poll_thread);
	}
}
4441 
/*
 * Public entry point: request a poll cycle from the interface's
 * poller thread.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4452 
/*
 * Entry point of an interface's dedicated RX poller thread.  Names
 * the thread, enters the embryonic state, and blocks into
 * ifnet_poll_thread_cont(), which runs the polling loop for the rest
 * of the thread's life.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4481 
4482 __attribute__((noreturn))
4483 static void
ifnet_poll_thread_cont(void * v,wait_result_t wres)4484 ifnet_poll_thread_cont(void *v, wait_result_t wres)
4485 {
4486 	struct dlil_threading_info *inp;
4487 	struct ifnet *ifp = v;
4488 	struct ifnet_stat_increment_param s;
4489 	struct timespec start_time;
4490 
4491 	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
4492 
4493 	bzero(&s, sizeof(s));
4494 	net_timerclear(&start_time);
4495 
4496 	lck_mtx_lock_spin(&ifp->if_poll_lock);
4497 	if (__improbable(wres == THREAD_INTERRUPTED ||
4498 	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
4499 		goto terminate;
4500 	}
4501 
4502 	inp = ifp->if_inp;
4503 	VERIFY(inp != NULL);
4504 
4505 	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
4506 		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
4507 		lck_mtx_unlock(&ifp->if_poll_lock);
4508 		ifnet_decr_pending_thread_count(ifp);
4509 		lck_mtx_lock_spin(&ifp->if_poll_lock);
4510 		goto skip;
4511 	}
4512 
4513 	ifp->if_poll_flags |= IF_POLLF_RUNNING;
4514 
4515 	/*
4516 	 * Keep on servicing until no more request.
4517 	 */
4518 	for (;;) {
4519 		struct mbuf *m_head, *m_tail;
4520 		u_int32_t m_lim, m_cnt, m_totlen;
4521 		u_int16_t req = ifp->if_poll_req;
4522 
4523 		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
4524 		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
4525 		lck_mtx_unlock(&ifp->if_poll_lock);
4526 
4527 		/*
4528 		 * If no longer attached, there's nothing to do;
4529 		 * else hold an IO refcnt to prevent the interface
4530 		 * from being detached (will be released below.)
4531 		 */
4532 		if (!ifnet_is_attached(ifp, 1)) {
4533 			lck_mtx_lock_spin(&ifp->if_poll_lock);
4534 			break;
4535 		}
4536 
4537 		if (dlil_verbose > 1) {
4538 			DLIL_PRINTF("%s: polling up to %d pkts, "
4539 			    "pkts avg %d max %d, wreq avg %d, "
4540 			    "bytes avg %d\n",
4541 			    if_name(ifp), m_lim,
4542 			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4543 			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4544 		}
4545 
4546 		/* invoke the driver's input poll routine */
4547 		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
4548 		&m_cnt, &m_totlen));
4549 
4550 		if (m_head != NULL) {
4551 			VERIFY(m_tail != NULL && m_cnt > 0);
4552 
4553 			if (dlil_verbose > 1) {
4554 				DLIL_PRINTF("%s: polled %d pkts, "
4555 				    "pkts avg %d max %d, wreq avg %d, "
4556 				    "bytes avg %d\n",
4557 				    if_name(ifp), m_cnt,
4558 				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4559 				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4560 			}
4561 
4562 			/* stats are required for extended variant */
4563 			s.packets_in = m_cnt;
4564 			s.bytes_in = m_totlen;
4565 
4566 			(void) ifnet_input_common(ifp, m_head, m_tail,
4567 			    &s, TRUE, TRUE);
4568 		} else {
4569 			if (dlil_verbose > 1) {
4570 				DLIL_PRINTF("%s: no packets, "
4571 				    "pkts avg %d max %d, wreq avg %d, "
4572 				    "bytes avg %d\n",
4573 				    if_name(ifp), ifp->if_rxpoll_pavg,
4574 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
4575 				    ifp->if_rxpoll_bavg);
4576 			}
4577 
4578 			(void) ifnet_input_common(ifp, NULL, NULL,
4579 			    NULL, FALSE, TRUE);
4580 		}
4581 
4582 		/* Release the io ref count */
4583 		ifnet_decr_iorefcnt(ifp);
4584 
4585 		lck_mtx_lock_spin(&ifp->if_poll_lock);
4586 
4587 		/* if there's no pending request, we're done */
4588 		if (req == ifp->if_poll_req ||
4589 		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
4590 			break;
4591 		}
4592 	}
4593 skip:
4594 	ifp->if_poll_req = 0;
4595 	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;
4596 
4597 	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
4598 		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
4599 		struct timespec *ts;
4600 
4601 		/*
4602 		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
4603 		 * until ifnet_poll() is called again.
4604 		 */
4605 		ts = &ifp->if_poll_cycle;
4606 		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
4607 			ts = NULL;
4608 		}
4609 
4610 		if (ts != NULL) {
4611 			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
4612 			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
4613 		}
4614 
4615 		(void) assert_wait_deadline(&ifp->if_poll_thread,
4616 		    THREAD_UNINT, deadline);
4617 		lck_mtx_unlock(&ifp->if_poll_lock);
4618 		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
4619 		/* NOTREACHED */
4620 	} else {
4621 terminate:
4622 		/* interface is detached (maybe while asleep)? */
4623 		ifnet_set_poll_cycle(ifp, NULL);
4624 
4625 		/* clear if_poll_thread to allow termination to continue */
4626 		ASSERT(ifp->if_poll_thread != THREAD_NULL);
4627 		ifp->if_poll_thread = THREAD_NULL;
4628 		wakeup((caddr_t)&ifp->if_poll_thread);
4629 		lck_mtx_unlock(&ifp->if_poll_lock);
4630 
4631 		if (dlil_verbose) {
4632 			DLIL_PRINTF("%s: poller thread terminated\n",
4633 			    if_name(ifp));
4634 		}
4635 
4636 		/* for the extra refcnt from kernel_thread_start() */
4637 		thread_deallocate(current_thread());
4638 		/* this is the end */
4639 		thread_terminate(current_thread());
4640 		/* NOTREACHED */
4641 	}
4642 
4643 	/* must never get here */
4644 	VERIFY(0);
4645 	/* NOTREACHED */
4646 	__builtin_unreachable();
4647 }
4648 
4649 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4650 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4651 {
4652 	if (ts == NULL) {
4653 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4654 	} else {
4655 		*(&ifp->if_poll_cycle) = *ts;
4656 	}
4657 
4658 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4659 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4660 		    if_name(ifp), ts->tv_nsec);
4661 	}
4662 }
4663 
4664 void
ifnet_purge(struct ifnet * ifp)4665 ifnet_purge(struct ifnet *ifp)
4666 {
4667 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4668 		if_qflush_snd(ifp, false);
4669 	}
4670 }
4671 
4672 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4673 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4674 {
4675 	IFCQ_LOCK_ASSERT_HELD(ifq);
4676 
4677 	if (!(IFCQ_IS_READY(ifq))) {
4678 		return;
4679 	}
4680 
4681 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4682 		struct tb_profile tb = {
4683 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4684 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4685 		};
4686 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4687 	}
4688 
4689 	ifclassq_update(ifq, ev);
4690 }
4691 
4692 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4693 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4694 {
4695 	switch (ev) {
4696 	case CLASSQ_EV_LINK_BANDWIDTH:
4697 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4698 			ifp->if_poll_update++;
4699 		}
4700 		break;
4701 
4702 	default:
4703 		break;
4704 	}
4705 }
4706 
4707 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4708 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4709 {
4710 	struct ifclassq *ifq;
4711 	u_int32_t omodel;
4712 	errno_t err;
4713 
4714 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4715 		return EINVAL;
4716 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4717 		return ENXIO;
4718 	}
4719 
4720 	ifq = ifp->if_snd;
4721 	IFCQ_LOCK(ifq);
4722 	omodel = ifp->if_output_sched_model;
4723 	ifp->if_output_sched_model = model;
4724 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4725 		ifp->if_output_sched_model = omodel;
4726 	}
4727 	IFCQ_UNLOCK(ifq);
4728 
4729 	return err;
4730 }
4731 
4732 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4733 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4734 {
4735 	if (ifp == NULL) {
4736 		return EINVAL;
4737 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4738 		return ENXIO;
4739 	}
4740 
4741 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4742 
4743 	return 0;
4744 }
4745 
4746 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4747 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4748 {
4749 	if (ifp == NULL || maxqlen == NULL) {
4750 		return EINVAL;
4751 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4752 		return ENXIO;
4753 	}
4754 
4755 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4756 
4757 	return 0;
4758 }
4759 
4760 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4761 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4762 {
4763 	errno_t err;
4764 
4765 	if (ifp == NULL || pkts == NULL) {
4766 		err = EINVAL;
4767 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4768 		err = ENXIO;
4769 	} else {
4770 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4771 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4772 	}
4773 
4774 	return err;
4775 }
4776 
4777 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4778 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4779     u_int32_t *pkts, u_int32_t *bytes)
4780 {
4781 	errno_t err;
4782 
4783 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4784 	    (pkts == NULL && bytes == NULL)) {
4785 		err = EINVAL;
4786 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4787 		err = ENXIO;
4788 	} else {
4789 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4790 		    pkts, bytes);
4791 	}
4792 
4793 	return err;
4794 }
4795 
4796 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4797 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4798 {
4799 	struct dlil_threading_info *inp;
4800 
4801 	if (ifp == NULL) {
4802 		return EINVAL;
4803 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4804 		return ENXIO;
4805 	}
4806 
4807 	if (maxqlen == 0) {
4808 		maxqlen = if_rcvq_maxlen;
4809 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4810 		maxqlen = IF_RCVQ_MINLEN;
4811 	}
4812 
4813 	inp = ifp->if_inp;
4814 	lck_mtx_lock(&inp->dlth_lock);
4815 	qlimit(&inp->dlth_pkts) = maxqlen;
4816 	lck_mtx_unlock(&inp->dlth_lock);
4817 
4818 	return 0;
4819 }
4820 
4821 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4822 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4823 {
4824 	struct dlil_threading_info *inp;
4825 
4826 	if (ifp == NULL || maxqlen == NULL) {
4827 		return EINVAL;
4828 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4829 		return ENXIO;
4830 	}
4831 
4832 	inp = ifp->if_inp;
4833 	lck_mtx_lock(&inp->dlth_lock);
4834 	*maxqlen = qlimit(&inp->dlth_pkts);
4835 	lck_mtx_unlock(&inp->dlth_lock);
4836 	return 0;
4837 }
4838 
4839 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4840 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4841     uint16_t delay_timeout)
4842 {
4843 	if (delay_qlen > 0 && delay_timeout > 0) {
4844 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4845 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4846 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4847 		/* convert timeout to nanoseconds */
4848 		ifp->if_start_delay_timeout *= 1000;
4849 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4850 		    ifp->if_xname, (uint32_t)delay_qlen,
4851 		    (uint32_t)delay_timeout);
4852 	} else {
4853 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4854 	}
4855 }
4856 
4857 /*
4858  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4859  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4860  * buf holds the full header.
4861  */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, used only when buf is not suitably aligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		/* copy the header into the aligned bounce buffer if needed */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* nothing to do when no DSCP bits are set; ECN bits are kept */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the IP header checksum for the TOS
		 * change (add old value, subtract new, fold the carry) —
		 * cf. RFC 1624 checksum update arithmetic.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write the modified header back if we bounced it */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		/* copy the header into the aligned bounce buffer if needed */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* nothing to do when no DSCP bits are set */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* no checksum field in the IPv6 header; just clear DSCP */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write the modified header back if we bounced it */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4917 
/*
 * Core enqueue path onto an interface's classq (or the supplied ifcq).
 * Stamps the packet with an uptime timestamp if it does not already
 * carry one, updates foreground/realtime activity timestamps, applies
 * the Wi-Fi multicast DSCP-clearing workaround, maintains the
 * delayed-start (enqueue coalescing) heuristics, enqueues the packet,
 * and finally pokes the driver's start routine when appropriate.
 * The caller relinquishes ownership of the packet in all cases.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	/* non-NULL only when the DSCP workaround applies; ip_ver is then valid */
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* ensure the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = mtod(p->cp_mbuf, struct ether_header *);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; skip the workaround (exits switch) */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may return a different mbuf; re-fetch */
				eh = mtod(p->cp_mbuf, struct ether_header *);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* unlike mbufs, no pullup here: skip if headers too short */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current window: count it */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on packet count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5227 
5228 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5229 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5230     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5231     boolean_t flush, boolean_t *pdrop)
5232 {
5233 	int error;
5234 
5235 	/* enqueue the packet (caller consumes object) */
5236 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5237 	    cnt, bytes, pdrop);
5238 
5239 	/*
5240 	 * Tell the driver to start dequeueing; do this even when the queue
5241 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5242 	 * be dequeueing from other unsuspended queues.
5243 	 */
5244 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5245 		ifnet_start(ifp);
5246 	}
5247 	return error;
5248 }
5249 
5250 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5251 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5252 {
5253 	struct ifnet *ifp = handle;
5254 	boolean_t pdrop;        /* dummy */
5255 	uint32_t i;
5256 
5257 	ASSERT(n_pkts >= 1);
5258 	for (i = 0; i < n_pkts - 1; i++) {
5259 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5260 		    FALSE, &pdrop);
5261 	}
5262 	/* flush with the last packet */
5263 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5264 	    TRUE, &pdrop);
5265 
5266 	return 0;
5267 }
5268 
5269 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5270 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5271     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5272 {
5273 	if (ifp->if_output_netem != NULL) {
5274 		bool drop;
5275 		errno_t error;
5276 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5277 		*pdrop = drop ? TRUE : FALSE;
5278 		return error;
5279 	} else {
5280 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5281 	}
5282 }
5283 
5284 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5285 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5286 {
5287 	uint32_t bytes = m_pktlen(m);
5288 	struct mbuf *tail = m;
5289 	uint32_t cnt = 1;
5290 	boolean_t pdrop;
5291 
5292 	while (tail->m_nextpkt) {
5293 		VERIFY(tail->m_flags & M_PKTHDR);
5294 		tail = tail->m_nextpkt;
5295 		cnt++;
5296 		bytes += m_pktlen(tail);
5297 	}
5298 
5299 	return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5300 }
5301 
5302 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5303 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5304     boolean_t *pdrop)
5305 {
5306 	classq_pkt_t pkt;
5307 
5308 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5309 	    m->m_nextpkt != NULL) {
5310 		if (m != NULL) {
5311 			m_freem_list(m);
5312 			*pdrop = TRUE;
5313 		}
5314 		return EINVAL;
5315 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5316 	    !IF_FULLY_ATTACHED(ifp)) {
5317 		/* flag tested without lock for performance */
5318 		m_freem(m);
5319 		*pdrop = TRUE;
5320 		return ENXIO;
5321 	} else if (!(ifp->if_flags & IFF_UP)) {
5322 		m_freem(m);
5323 		*pdrop = TRUE;
5324 		return ENETDOWN;
5325 	}
5326 
5327 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5328 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5329 }
5330 
5331 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5332 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5333     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5334     boolean_t *pdrop)
5335 {
5336 	classq_pkt_t head, tail;
5337 
5338 	ASSERT(m_head != NULL);
5339 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5340 	ASSERT(m_tail != NULL);
5341 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5342 	ASSERT(ifp != NULL);
5343 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5344 
5345 	if (!IF_FULLY_ATTACHED(ifp)) {
5346 		/* flag tested without lock for performance */
5347 		m_freem_list(m_head);
5348 		*pdrop = TRUE;
5349 		return ENXIO;
5350 	} else if (!(ifp->if_flags & IFF_UP)) {
5351 		m_freem_list(m_head);
5352 		*pdrop = TRUE;
5353 		return ENETDOWN;
5354 	}
5355 
5356 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5357 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5358 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5359 	           flush, pdrop);
5360 }
5361 
5362 #if SKYWALK
5363 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5364 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5365     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5366 {
5367 	classq_pkt_t pkt;
5368 
5369 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5370 
5371 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5372 		if (kpkt != NULL) {
5373 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5374 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5375 			*pdrop = TRUE;
5376 		}
5377 		return EINVAL;
5378 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5379 	    !IF_FULLY_ATTACHED(ifp))) {
5380 		/* flag tested without lock for performance */
5381 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5382 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5383 		*pdrop = TRUE;
5384 		return ENXIO;
5385 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5386 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5387 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5388 		*pdrop = TRUE;
5389 		return ENETDOWN;
5390 	}
5391 
5392 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5393 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5394 }
5395 
5396 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5397 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5398     boolean_t flush, boolean_t *pdrop)
5399 {
5400 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5401 }
5402 
5403 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5404 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5405     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5406 {
5407 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5408 }
5409 
5410 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5411 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5412     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5413     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5414 {
5415 	classq_pkt_t head, tail;
5416 
5417 	ASSERT(k_head != NULL);
5418 	ASSERT(k_tail != NULL);
5419 	ASSERT(ifp != NULL);
5420 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5421 
5422 	if (!IF_FULLY_ATTACHED(ifp)) {
5423 		/* flag tested without lock for performance */
5424 		pp_free_packet_chain(k_head, NULL);
5425 		*pdrop = TRUE;
5426 		return ENXIO;
5427 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5428 		pp_free_packet_chain(k_head, NULL);
5429 		*pdrop = TRUE;
5430 		return ENETDOWN;
5431 	}
5432 
5433 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5434 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5435 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5436 	           flush, pdrop);
5437 }
5438 
5439 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5440 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5441     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5442     boolean_t *pdrop)
5443 {
5444 	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5445 	           cnt, bytes, flush, pdrop);
5446 }
5447 
5448 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5449 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5450     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5451     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5452 {
5453 	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5454 	           cnt, bytes, flush, pdrop);
5455 }
5456 #endif /* SKYWALK */
5457 
5458 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5459 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5460 {
5461 	errno_t rc;
5462 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5463 
5464 	if (ifp == NULL || mp == NULL) {
5465 		return EINVAL;
5466 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5467 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5468 		return ENXIO;
5469 	}
5470 	if (!ifnet_is_attached(ifp, 1)) {
5471 		return ENXIO;
5472 	}
5473 
5474 #if SKYWALK
5475 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5476 #endif /* SKYWALK */
5477 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5478 	    &pkt, NULL, NULL, NULL, 0);
5479 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5480 	ifnet_decr_iorefcnt(ifp);
5481 	*mp = pkt.cp_mbuf;
5482 	return rc;
5483 }
5484 
5485 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5486 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5487     struct mbuf **mp)
5488 {
5489 	errno_t rc;
5490 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5491 
5492 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5493 		return EINVAL;
5494 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5495 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5496 		return ENXIO;
5497 	}
5498 	if (!ifnet_is_attached(ifp, 1)) {
5499 		return ENXIO;
5500 	}
5501 
5502 #if SKYWALK
5503 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5504 #endif /* SKYWALK */
5505 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5506 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5507 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5508 	ifnet_decr_iorefcnt(ifp);
5509 	*mp = pkt.cp_mbuf;
5510 	return rc;
5511 }
5512 
5513 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5514 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5515     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5516 {
5517 	errno_t rc;
5518 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5519 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5520 
5521 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5522 		return EINVAL;
5523 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5524 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5525 		return ENXIO;
5526 	}
5527 	if (!ifnet_is_attached(ifp, 1)) {
5528 		return ENXIO;
5529 	}
5530 
5531 #if SKYWALK
5532 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5533 #endif /* SKYWALK */
5534 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5535 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5536 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5537 	ifnet_decr_iorefcnt(ifp);
5538 	*head = pkt_head.cp_mbuf;
5539 	if (tail != NULL) {
5540 		*tail = pkt_tail.cp_mbuf;
5541 	}
5542 	return rc;
5543 }
5544 
5545 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5546 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5547     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5548 {
5549 	errno_t rc;
5550 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5551 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5552 
5553 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5554 		return EINVAL;
5555 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5556 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5557 		return ENXIO;
5558 	}
5559 	if (!ifnet_is_attached(ifp, 1)) {
5560 		return ENXIO;
5561 	}
5562 
5563 #if SKYWALK
5564 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5565 #endif /* SKYWALK */
5566 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5567 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5568 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5569 	ifnet_decr_iorefcnt(ifp);
5570 	*head = pkt_head.cp_mbuf;
5571 	if (tail != NULL) {
5572 		*tail = pkt_tail.cp_mbuf;
5573 	}
5574 	return rc;
5575 }
5576 
5577 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5578 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5579     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5580     u_int32_t *len)
5581 {
5582 	errno_t rc;
5583 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5584 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5585 
5586 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5587 	    !MBUF_VALID_SC(sc)) {
5588 		return EINVAL;
5589 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5590 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5591 		return ENXIO;
5592 	}
5593 	if (!ifnet_is_attached(ifp, 1)) {
5594 		return ENXIO;
5595 	}
5596 
5597 #if SKYWALK
5598 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5599 #endif /* SKYWALK */
5600 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5601 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5602 	    cnt, len, 0);
5603 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5604 	ifnet_decr_iorefcnt(ifp);
5605 	*head = pkt_head.cp_mbuf;
5606 	if (tail != NULL) {
5607 		*tail = pkt_tail.cp_mbuf;
5608 	}
5609 	return rc;
5610 }
5611 
#if XNU_TARGET_OS_OSX
/*
 * Adapter from the extended framer interface to the legacy one: the
 * legacy framer cannot report header growth, so zero out the optional
 * pre/post outputs and forward to ifp->if_framer_legacy.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}
	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
#endif /* XNU_TARGET_OS_OSX */
5628 
5629 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5630 packet_has_vlan_tag(struct mbuf * m)
5631 {
5632 	u_int   tag = 0;
5633 
5634 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5635 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5636 		if (tag == 0) {
5637 			/* the packet is just priority-tagged, clear the bit */
5638 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5639 		}
5640 	}
5641 	return tag != 0;
5642 }
5643 
/*
 * Run an inbound packet (*m_p) through ifp's interface filter chain.
 *
 * Returns 0 when the packet should continue up the stack (a filter may
 * have replaced *m_p and/or *frame_header_p), or the non-zero result of
 * the filter that stopped processing (EJUSTRETURN means the filter
 * consumed the packet).  When skip_bridge is set, DLIL_IFF_BRIDGE
 * filters are skipped because the bridge already saw the packet via the
 * early-input path.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family,
    boolean_t skip_bridge)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}
		/* the bridge has already seen the packet */
		if (skip_bridge &&
		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
			continue;
		}
		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callout; the busy
			 * marker set above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5709 
/*
 * Run an outbound packet (*m_p) through ifp's interface filter chain.
 *
 * Returns 0 when the packet should continue toward the driver (a filter
 * may have replaced *m_p), or the non-zero result of the filter that
 * stopped processing (EJUSTRETURN means the filter consumed the packet).
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callout; the busy
			 * marker set above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5762 
5763 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5764 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5765 {
5766 	int error;
5767 
5768 	if (ifproto->proto_kpi == kProtoKPI_v1) {
5769 		/* Version 1 protocols get one packet at a time */
5770 		while (m != NULL) {
5771 			char *  frame_header;
5772 			mbuf_t  next_packet;
5773 
5774 			next_packet = m->m_nextpkt;
5775 			m->m_nextpkt = NULL;
5776 			frame_header = m->m_pkthdr.pkt_hdr;
5777 			m->m_pkthdr.pkt_hdr = NULL;
5778 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5779 			    ifproto->protocol_family, m, frame_header);
5780 			if (error != 0 && error != EJUSTRETURN) {
5781 				m_freem(m);
5782 			}
5783 			m = next_packet;
5784 		}
5785 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
5786 		/* Version 2 protocols support packet lists */
5787 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5788 		    ifproto->protocol_family, m);
5789 		if (error != 0 && error != EJUSTRETURN) {
5790 			m_freem_list(m);
5791 		}
5792 	}
5793 }
5794 
5795 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5796 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5797     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5798 {
5799 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5800 
5801 	if (s->packets_in != 0) {
5802 		d->packets_in += s->packets_in;
5803 	}
5804 	if (s->bytes_in != 0) {
5805 		d->bytes_in += s->bytes_in;
5806 	}
5807 	if (s->errors_in != 0) {
5808 		d->errors_in += s->errors_in;
5809 	}
5810 
5811 	if (s->packets_out != 0) {
5812 		d->packets_out += s->packets_out;
5813 	}
5814 	if (s->bytes_out != 0) {
5815 		d->bytes_out += s->bytes_out;
5816 	}
5817 	if (s->errors_out != 0) {
5818 		d->errors_out += s->errors_out;
5819 	}
5820 
5821 	if (s->collisions != 0) {
5822 		d->collisions += s->collisions;
5823 	}
5824 	if (s->dropped != 0) {
5825 		d->dropped += s->dropped;
5826 	}
5827 
5828 	if (poll) {
5829 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5830 	}
5831 }
5832 
/*
 * Flush the input thread's accumulated stats (inp->dlth_stats) into the
 * interface's global counters, zeroing each accumulator as it goes.
 * Returns TRUE when the interface has a data threshold configured
 * (if_data_threshold != 0), so the caller knows further notification
 * work may be needed.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
5892 
5893 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5894 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5895 {
5896 	return dlil_input_packet_list_common(ifp, m, 0,
5897 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5898 }
5899 
5900 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5901 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5902     u_int32_t cnt, ifnet_model_t mode)
5903 {
5904 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5905 }
5906 
/*
 * Give the bridge an early look at an inbound mbuf chain before the
 * regular interface-filter pass.  The if_flt monitor is marked busy
 * around the call so the filter list cannot be torn down while
 * bridge_early_input() runs with the filter lock dropped.  Returns the
 * (possibly consumed/modified) chain.
 */
static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
{
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_busy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * if_bridge is re-checked here even though the caller tested it;
	 * NOTE(review): presumably it can change once the caller's
	 * unlocked check has passed — confirm against ifnet teardown.
	 */
	if (ifp->if_bridge != NULL) {
		m = bridge_early_input(ifp, m, cnt);
	}
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
	return m;
}
5922 
/*
 * Core inbound demux loop: walk an mbuf chain, and for each packet take
 * an I/O (data-mov) reference on its receive interface, demux it to a
 * protocol family, apply CLAT46 translation and interface filters, then
 * batch consecutive packets for the same protocol attachment and hand
 * each batch to dlil_ifproto_input().
 *
 * ifp_param  non-NULL: all packets belong to this interface;
 *            NULL: each packet's m_pkthdr.rcvif is used instead.
 * m          head of the inbound mbuf chain (linked via m_nextpkt).
 * cnt        caller's packet count hint (used for poll pacing and
 *            bridge early input).
 * mode/ext   input model; polling pacing only engages when ext is TRUE,
 *            mode is IFNET_MODEL_INPUT_POLL_ON and cnt > 1.
 *
 * All packets are either passed up or freed here; error paths jump to
 * "next" which also flushes any pending per-protocol batch and drops
 * the I/O reference when the interface changes or the chain ends.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;
	mbuf_t *pkt_next = NULL;
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;
	boolean_t skip_bridge_filter = FALSE;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm periodic ifnet_poll() pacing for large polled batches */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}
	/* bridge early input sees the whole chain once, up front */
	if (bridge_enable_early_input != 0 &&
	    ifp != NULL && ifp->if_bridge != NULL) {
		m = handle_bridge_early_input(ifp, m, cnt);
		skip_bridge_filter = TRUE;
	}
	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain before processing it */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			protocol_family = 0;
		}
		/* check for an updated frame header */
		if (m->m_pkthdr.pkt_hdr != NULL) {
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* optionally log wake packets (first 64 bytes) for debugging */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if ((m->m_flags & M_PROMISC) == 0 &&
		    protocol_family == PF_INET6 &&
		    IS_INTF_CLAT46(ifp) &&
		    dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* 464XLAT: v6 -> v4; may replace m and protocol_family */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = mtod(m, char*);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			/* PF_INET/PF_INET6 wake packets are matched later, in-protocol */
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* sanity-check frame_header before computing the offset */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family, skip_bridge_filter);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN and Bond interface receives packets by attaching
		 * a "protocol" to the underlying interface.
		 * A promiscuous packet needs to be delivered to the
		 * VLAN or Bond interface since:
		 * - Bond interface member may not support setting the
		 *   MAC address, so packets are inherently "promiscuous"
		 * - A VLAN or Bond interface could be members of a bridge,
		 *   where promiscuous packets correspond to other
		 *   devices that the bridge forwards packets to/from
		 */
		if ((m->m_flags & M_PROMISC) != 0) {
			switch (protocol_family) {
			case PF_VLAN:
			case PF_BOND:
				/* VLAN and Bond get promiscuous packets */
				break;
			default:
				m_freem(m);
				goto next;
			}
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same attachment as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6273 
6274 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6275 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6276 {
6277 	errno_t err;
6278 
6279 	if (sync) {
6280 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6281 		if (err == EAFNOSUPPORT) {
6282 			err = 0;
6283 		}
6284 	} else {
6285 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6286 		err = 0;
6287 	}
6288 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6289 	    "(err=%d)\n", if_name(ifp),
6290 	    (err == 0 ? "successfully restored" : "failed to restore"),
6291 	    ifp->if_updatemcasts, err);
6292 
6293 	/* just return success */
6294 	return 0;
6295 }
6296 
6297 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6298 if_mcasts_update_async(struct ifnet *ifp)
6299 {
6300 	return if_mcasts_update_common(ifp, false);
6301 }
6302 
6303 errno_t
if_mcasts_update(struct ifnet * ifp)6304 if_mcasts_update(struct ifnet *ifp)
6305 {
6306 	return if_mcasts_update_common(ifp, true);
6307 }
6308 
6309 /* If ifp is set, we will increment the generation for the interface */
6310 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6311 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6312 {
6313 	if (ifp != NULL) {
6314 		ifnet_increment_generation(ifp);
6315 	}
6316 
6317 #if NECP
6318 	necp_update_all_clients();
6319 #endif /* NECP */
6320 
6321 	return kev_post_msg(event);
6322 }
6323 
6324 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6325 dlil_post_sifflags_msg(struct ifnet * ifp)
6326 {
6327 	struct kev_msg ev_msg;
6328 	struct net_event_data ev_data;
6329 
6330 	bzero(&ev_data, sizeof(ev_data));
6331 	bzero(&ev_msg, sizeof(ev_msg));
6332 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6333 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6334 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6335 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6336 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6337 	ev_data.if_family = ifp->if_family;
6338 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6339 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6340 	ev_msg.dv[0].data_ptr = &ev_data;
6341 	ev_msg.dv[1].data_length = 0;
6342 	dlil_post_complete_msg(ifp, &ev_msg);
6343 }
6344 
6345 #define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Distribute a kernel event to everything attached to ifp: first the
 * interface filters (with the filter lock dropped across each callout,
 * guarded by the busy monitor), then every attached protocol's event
 * handler (snapshotted into a refcounted array so the ifnet lock need
 * not be held across the callouts), then the interface's own if_event
 * hook.  Finally posts the event via dlil_post_complete_msg(), bumping
 * the interface generation only when update_generation is true.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	/* count first so we can size the snapshot array (lock held throughout) */
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			/* small list: use the on-stack array */
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot every attached proto, taking a ref on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted proto, dropping refs as we go */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6445 
6446 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6447 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6448 {
6449 	struct kev_msg kev_msg;
6450 	int result = 0;
6451 
6452 	if (ifp == NULL || event == NULL) {
6453 		return EINVAL;
6454 	}
6455 
6456 	bzero(&kev_msg, sizeof(kev_msg));
6457 	kev_msg.vendor_code = event->vendor_code;
6458 	kev_msg.kev_class = event->kev_class;
6459 	kev_msg.kev_subclass = event->kev_subclass;
6460 	kev_msg.event_code = event->event_code;
6461 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6462 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6463 	kev_msg.dv[1].data_length = 0;
6464 
6465 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6466 
6467 	return result;
6468 }
6469 
6470 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6471 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6472 {
6473 	mbuf_t  n = m;
6474 	int chainlen = 0;
6475 
6476 	while (n != NULL) {
6477 		chainlen++;
6478 		n = n->m_next;
6479 	}
6480 	switch (chainlen) {
6481 	case 0:
6482 		break;
6483 	case 1:
6484 		os_atomic_inc(&cls->cls_one, relaxed);
6485 		break;
6486 	case 2:
6487 		os_atomic_inc(&cls->cls_two, relaxed);
6488 		break;
6489 	case 3:
6490 		os_atomic_inc(&cls->cls_three, relaxed);
6491 		break;
6492 	case 4:
6493 		os_atomic_inc(&cls->cls_four, relaxed);
6494 		break;
6495 	case 5:
6496 	default:
6497 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6498 		break;
6499 	}
6500 }
6501 
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip "send" probe for an outbound IPv4 or IPv6 packet.
 * The DTRACE_IP6 probe carries both a v4 and a v6 header argument; the
 * one that does not apply is passed as NULL.  Kept out-of-line
 * (noinline) so the probe site does not get folded into dlil_output().
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6520 
6521 /*
6522  * dlil_output
6523  *
6524  * Caller should have a lock on the protocol domain if the protocol
6525  * doesn't support finer grained locking. In most cases, the lock
6526  * will be held from the socket layer and won't be released until
6527  * we return back to the socket layer.
6528  *
6529  * This does mean that we must take a protocol lock before we take
6530  * an interface lock if we're going to take both. This makes sense
6531  * because a protocol is likely to interact with an ifp while it
6532  * is under the protocol lock.
6533  *
6534  * An advisory code will be returned if adv is not null. This
6535  * can be used to provide feedback about interface queues to the
6536  * application.
6537  */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int flags, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[DLIL_MAX_FRAME_TYPE_BUFFER_SIZE];
	char dst_linkaddr_buffer[DLIL_MAX_LINKADDR_BUFFER_SIZE];
	struct if_proto *proto = NULL;
	mbuf_t  m = NULL;
	mbuf_t  send_head = NULL;       /* chain accumulated for SENDLIST/ENQUEUE_MULTI */
	mbuf_t  *send_tail = &send_head;
	int iorefcnt = 0;               /* 1 once ifnet_datamov_begin() succeeded */
	u_int32_t pre = 0, post = 0;    /* bytes the framer pre/appended */
	u_int32_t fpkts = 0, fbytes = 0; /* forwarded packet/byte counters */
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;   /* TRUE after first successful 4->6 translation */
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;      /* route ref taken for CLAT46; freed in cleanup */
	u_int16_t m_loop_set = 0;
	bool raw = (flags & DLIL_OUTPUT_FLAGS_RAW) != 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0) {
		if_mcasts_update_async(ifp);
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	/* non-raw output needs the attached protocol for pre-output/framing */
	if (flags == DLIL_OUTPUT_FLAGS_NONE) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	/* Dequeue the next packet from the caller's chain; done when empty */
	if (packetlist == NULL) {
		goto cleanup;
	}

	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = SA(&dest6);

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped(SA(&dest6),
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retrieve the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed by packetlist is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (flags == DLIL_OUTPUT_FLAGS_NONE) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	nanouptime(&now);
	net_timernsec(&now, &now_nsec);

	/* Per-packet loop: frame, filter, and hand each mbuf to the driver */
	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if (raw && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (flags == DLIL_OUTPUT_FLAGS_NONE) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (flags == DLIL_OUTPUT_FLAGS_NONE && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp.  This will allow
			 * us to determine that it is a legitimate packet
			 * for the system.  Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context.  If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		if ((flags & DLIL_OUTPUT_FLAGS_SKIP_IF_FILTERS) == 0) {
			retval = dlil_interface_filters_output(ifp, &m, proto_family);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary.  If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/* batch path: accumulate and submit after the loop */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			(void) mbuf_set_timestamp(m, now_nsec, TRUE);

			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			/*
			 * Record timestamp; ifnet_enqueue() will use this info
			 * rather than redoing the work.
			 */
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			(void) mbuf_set_timestamp(m, now_nsec, TRUE);

			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* queue full/suspended: report via flow advisory */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		m = packetlist;
		if (m != NULL) {
			/* restore M_LOOP (cleared by the framer) on the next packet */
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	/* Flush any chain accumulated for SENDLIST/ENQUEUE_MULTI interfaces */
	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			/* driver accepts a whole packet chain in one call */
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			/* enqueue one at a time, start the interface once at the end */
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
7048 
7049 /*
7050  * This routine checks if the destination address is not a loopback, link-local,
7051  * multicast or broadcast address.
7052  */
7053 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7054 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7055 {
7056 	int ret = 0;
7057 	switch (proto_family) {
7058 	case PF_INET: {
7059 		struct ip *iph = mtod(m, struct ip *);
7060 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7061 			ret = 1;
7062 		}
7063 		break;
7064 	}
7065 	case PF_INET6: {
7066 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7067 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7068 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7069 			ret = 1;
7070 		}
7071 		break;
7072 	}
7073 	}
7074 
7075 	return ret;
7076 }
7077 /*
7078  * @brief This routine translates IPv4 packet to IPv6 packet,
7079  *     updates protocol checksum and also translates ICMP for code
7080  *     along with inner header translation.
7081  *
7082  * @param ifp Pointer to the interface
7083  * @param proto_family pointer to protocol family. It is updated if function
7084  *     performs the translation successfully.
7085  * @param m Pointer to the pointer pointing to the packet. Needed because this
7086  *     routine can end up changing the mbuf to a different one.
7087  *
7088  * @return 0 on success or else a negative value.
7089  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;      /* original v4 src/dst, kept for proto translation */
	uint8_t proto = 0;
	struct in6_addr src_storage = {};
	struct in6_addr *src = NULL;
	struct sockaddr_in6 dstsock = {};
	int error = 0;
	uint16_t off = 0;               /* v4 header length in bytes */
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf; the mbuf is recovered from it in cleanup */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Capture the v4 header fields before the header is rewritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	dstsock.sin6_len = sizeof(struct sockaddr_in6);
	dstsock.sin6_family = AF_INET6;

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
	    NULL, NULL, TRUE);

	if (src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}


	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	/*
	 * Hand the (possibly replaced) mbuf back to the caller.  If the
	 * pbuf is no longer valid the packet is gone; report failure so
	 * the caller drops it.
	 */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		/* Success: the packet is now IPv6 from the caller's viewpoint */
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7223 
7224 /*
7225  * @brief This routine translates incoming IPv6 to IPv4 packet,
7226  *     updates protocol checksum and also translates ICMPv6 outer
7227  *     and inner headers
7228  *
7229  * @return 0 on success or else a negative value.
7230  */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;     /* original v6 src/dst, kept for proto translation */
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf; the mbuf is recovered in cleanup */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			ifa_remref(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		ifa_remref(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		/* Capture header fields before the header is rewritten */
		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			ifa_remref(&ia4_clat_dst->ia_ifa);
		}

		/* Recover the (possibly replaced) mbuf from the pbuf */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			/* Success: the packet is now IPv4 from the caller's viewpoint */
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7365 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Deferred ioctl request: which ioctl to replay on which interface */
struct ifnet_ioctl_event {
	struct ifnet *ifp;
	u_long ioctl_code;
};

/* Work-queue entry wrapping an ifnet_ioctl_event for nwk_wq_enqueue() */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7378 
7379 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7380 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7381 {
7382 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7383 	bool compare_expected;
7384 
7385 	/*
7386 	 * Get an io ref count if the interface is attached.
7387 	 * At this point it most likely is. We are taking a reference for
7388 	 * deferred processing.
7389 	 */
7390 	if (!ifnet_is_attached(ifp, 1)) {
7391 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7392 		    "is not attached",
7393 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7394 		return;
7395 	}
7396 	switch (ioctl_code) {
7397 	case SIOCADDMULTI:
7398 		compare_expected = false;
7399 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7400 			ifnet_decr_iorefcnt(ifp);
7401 			return;
7402 		}
7403 		break;
7404 	case SIOCDELMULTI:
7405 		compare_expected = false;
7406 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7407 			ifnet_decr_iorefcnt(ifp);
7408 			return;
7409 		}
7410 		break;
7411 	default:
7412 		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7413 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7414 		return;
7415 	}
7416 
7417 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7418 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7419 
7420 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7421 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7422 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7423 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7424 }
7425 
7426 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7427 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7428 {
7429 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7430 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7431 
7432 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7433 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7434 	int ret = 0;
7435 
7436 	switch (ioctl_code) {
7437 	case SIOCADDMULTI:
7438 		atomic_store(&ifp->if_mcast_add_signaled, false);
7439 		break;
7440 	case SIOCDELMULTI:
7441 		atomic_store(&ifp->if_mcast_del_signaled, false);
7442 		break;
7443 	}
7444 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7445 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7446 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7447 	} else if (dlil_verbose) {
7448 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7449 		    "for ioctl %lu",
7450 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7451 	}
7452 	ifnet_decr_iorefcnt(ifp);
7453 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7454 	return;
7455 }
7456 
/*
 * Dispatch an ioctl on an interface, in order: interface filters first,
 * then the attached protocol (when proto_fam != 0), then the driver's
 * if_ioctl.  The first non-EOPNOTSUPP result is kept; a hard error
 * short-circuits the remaining stages, and EJUSTRETURN from any stage
 * is converted to 0 for the caller.  An io reference is held for the
 * duration of the call.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock; the filter callback may block */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7574 
7575 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7576 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7577 {
7578 	errno_t error = 0;
7579 
7580 	if (ifp->if_set_bpf_tap) {
7581 		/* Get an io reference on the interface if it is attached */
7582 		if (!ifnet_is_attached(ifp, 1)) {
7583 			return ENXIO;
7584 		}
7585 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7586 		ifnet_decr_iorefcnt(ifp);
7587 	}
7588 	return error;
7589 }
7590 
/*
 * Resolve a multicast protocol address into a link-layer address in
 * ll_addr (zeroed first).  The attached protocol's resolve_multi
 * callback gets the first chance; the driver's if_check_multi then
 * verifies the result (or, when no resolver exists, is handed the
 * protocol address directly).
 */
errno_t
dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
    struct sockaddr *ll_addr, size_t ll_len)
{
	errno_t result = EOPNOTSUPP;
	struct if_proto *proto;
	const struct sockaddr *verify;
	proto_media_resolve_multi resolvep;

	/* take an io ref for the duration of the call */
	if (!ifnet_is_attached(ifp, 1)) {
		return result;
	}

	bzero(ll_addr, ll_len);

	/* Call the protocol first; callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, proto_addr->sa_family);
	ifnet_lock_done(ifp);
	if (proto != NULL) {
		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
		if (resolvep != NULL) {
			result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
		}
		if_proto_free(proto);
	}

	/* Let the interface verify the multicast address */
	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
		if (result == 0) {
			/* protocol resolved it; verify the link-layer address */
			verify = ll_addr;
		} else {
			/* no resolver; let the driver see the protocol address */
			verify = proto_addr;
		}
		result = ifp->if_check_multi(ifp, verify);
	}

	ifnet_decr_iorefcnt(ifp);
	return result;
}
7632 
7633 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7634 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7635     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7636     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7637 {
7638 	struct if_proto *proto;
7639 	errno_t result = 0;
7640 
7641 	if ((ifp->if_flags & IFF_NOARP) != 0) {
7642 		result = ENOTSUP;
7643 		goto done;
7644 	}
7645 
7646 	/* callee holds a proto refcnt upon success */
7647 	ifnet_lock_shared(ifp);
7648 	proto = find_attached_proto(ifp, target_proto->sa_family);
7649 	ifnet_lock_done(ifp);
7650 	if (proto == NULL) {
7651 		result = ENOTSUP;
7652 	} else {
7653 		proto_media_send_arp    arpp;
7654 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7655 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7656 		if (arpp == NULL) {
7657 			result = ENOTSUP;
7658 		} else {
7659 			switch (arpop) {
7660 			case ARPOP_REQUEST:
7661 				arpstat.txrequests++;
7662 				if (target_hw != NULL) {
7663 					arpstat.txurequests++;
7664 				}
7665 				break;
7666 			case ARPOP_REPLY:
7667 				arpstat.txreplies++;
7668 				break;
7669 			}
7670 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7671 			    target_hw, target_proto);
7672 		}
7673 		if_proto_free(proto);
7674 	}
7675 done:
7676 	return result;
7677 }
7678 
/*
 * net_thread_marks is an empty tag type: net_thread_marks_t values are
 * never dereferenced.  A mark cookie is instead encoded as a byte
 * offset from a static base address (see net_thread_marks_push() and
 * net_thread_marks_pop()).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* cookie meaning "no mark bits were changed" (offset 0 from the base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7684 
/*
 * Set the requested mark bits on the current thread's uu_network_marks
 * and return a cookie encoding which bits were newly set, so that
 * net_thread_marks_pop() can restore the prior state.  Bits already set
 * are left alone (and not encoded in the cookie).
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits not already marked need to be undone later */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* encode the changed-bit set as an offset from the base address */
	return (net_thread_marks_t)&base[pop];
}
7702 
/*
 * Clear the requested mark bits on the current thread's
 * uu_network_marks and return a cookie encoding which bits were
 * actually cleared, for restoration by net_thread_unmarks_pop().
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits currently set need to be re-set later */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* encode the changed-bit set as an offset from the base address */
	return (net_thread_marks_t)&base[unpop];
}
7720 
/*
 * Undo a net_thread_marks_push(): decode the cookie back into its bit
 * set (the pointer offset from the static base) and clear those bits on
 * the current thread.  Passing net_thread_marks_none is a no-op.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* cookie must fit in 32 bits and match bits currently set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7736 
/*
 * Undo a net_thread_unmarks_push(): decode the cookie and re-set the
 * bits it encodes on the current thread.  Those bits must currently be
 * clear; passing net_thread_marks_none is a no-op.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* cookie must fit in 32 bits and its bits must be clear now */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= (u_int32_t)unpop;
	}
}
7752 
7753 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7754 net_thread_is_marked(u_int32_t check)
7755 {
7756 	if (check != 0) {
7757 		struct uthread *uth = current_uthread();
7758 		return uth->uu_network_marks & check;
7759 	} else {
7760 		return 0;
7761 	}
7762 }
7763 
7764 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7765 net_thread_is_unmarked(u_int32_t check)
7766 {
7767 	if (check != 0) {
7768 		struct uthread *uth = current_uthread();
7769 		return ~uth->uu_network_marks & check;
7770 	} else {
7771 		return 0;
7772 	}
7773 }
7774 
7775 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7776 _is_announcement(const struct sockaddr_in * sender_sin,
7777     const struct sockaddr_in * target_sin)
7778 {
7779 	if (target_sin == NULL || sender_sin == NULL) {
7780 		return FALSE;
7781 	}
7782 
7783 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7784 }
7785 
/*
 * Send an ARP packet.  Normally forwards directly to
 * dlil_send_arp_internal() on ifp; the exception is an ARP request for
 * an IPv4 link-local target (that is not an announcement), which is
 * sent on every interface marked IFEF_ARPLL, using each interface's own
 * hardware and first IPv4 source address.  RTF_ROUTER in rtflags is
 * passed to the protocol's send_arp callback via SIN_ROUTER.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = __DECONST_SA(target_proto0);

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = SA(&target_proto_sinarp);
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = SIN(sender_proto);
	target_sin = SIN(target_proto);
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *__counted_by(count) ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr alive after dropping the lock */
				ifa_addref(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, SDL(source_hw->ifa_addr),
				    SA(&source_ip_copy), NULL,
				    target_proto);

				ifa_remref(source_hw);
				/* keep the first real result across interfaces */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free_counted_by(ifp_list, count);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7896 
7897 /*
7898  * Caller must hold ifnet head lock.
7899  */
7900 static int
ifnet_lookup(struct ifnet * ifp)7901 ifnet_lookup(struct ifnet *ifp)
7902 {
7903 	struct ifnet *_ifp;
7904 
7905 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7906 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7907 		if (_ifp == ifp) {
7908 			break;
7909 		}
7910 	}
7911 	return _ifp != NULL;
7912 }
7913 
7914 /*
7915  * Caller has to pass a non-zero refio argument to get a
7916  * IO reference count. This will prevent ifnet_detach from
7917  * being called when there are outstanding io reference counts.
7918  */
7919 int
ifnet_is_attached(struct ifnet * ifp,int refio)7920 ifnet_is_attached(struct ifnet *ifp, int refio)
7921 {
7922 	int ret;
7923 
7924 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7925 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7926 		if (refio > 0) {
7927 			ifp->if_refio++;
7928 		}
7929 	}
7930 	lck_mtx_unlock(&ifp->if_ref_lock);
7931 
7932 	return ret;
7933 }
7934 
/* Account for a kernel thread being spun up on behalf of this interface */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7942 
7943 void
ifnet_decr_pending_thread_count(struct ifnet * ifp)7944 ifnet_decr_pending_thread_count(struct ifnet *ifp)
7945 {
7946 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7947 	VERIFY(ifp->if_threads_pending > 0);
7948 	ifp->if_threads_pending--;
7949 	if (ifp->if_threads_pending == 0) {
7950 		wakeup(&ifp->if_threads_pending);
7951 	}
7952 	lck_mtx_unlock(&ifp->if_ref_lock);
7953 }
7954 
/*
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already.
 * Most callers would call ifnet_is_{attached,data_ready}() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* an io ref can only be stacked on top of an existing one */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7969 
/*
 * Drop one IO reference; if_ref_lock must be held by the caller.
 * When the last reference goes away while IFRF_DETACHING is set, the
 * thread sleeping on if_refio (the detach path) is woken up.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* the last io ref cannot disappear while data movers remain */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
7990 
/*
 * Release an IO reference taken via ifnet_is_attached(ifp, 1) or
 * ifnet_incr_iorefcnt().
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7998 
7999 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8000 ifnet_datamov_begin(struct ifnet *ifp)
8001 {
8002 	boolean_t ret;
8003 
8004 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8005 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8006 		ifp->if_refio++;
8007 		ifp->if_datamov++;
8008 	}
8009 	lck_mtx_unlock(&ifp->if_ref_lock);
8010 
8011 	DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
8012 	return ret;
8013 }
8014 
/*
 * Leave a data-movement section started by ifnet_datamov_begin():
 * wakes any drainer once the last mover exits, then releases the IO
 * reference taken at begin time.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);

	DTRACE_IP1(datamov__end, struct ifnet *, ifp);
}
8034 
/*
 * Suspend data movement: take an IO reference and, on the first
 * suspension, clear IFRF_READY so new datamov sections cannot begin.
 * if_ref_lock must be held; balanced by ifnet_datamov_resume().
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8045 
/*
 * Suspend data movement on an attached (or detaching) interface;
 * balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8054 
8055 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8056 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8057 {
8058 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8059 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8060 	if (ifp->if_suspend > 0) {
8061 		lck_mtx_unlock(&ifp->if_ref_lock);
8062 		return FALSE;
8063 	}
8064 	ifnet_datamov_suspend_locked(ifp);
8065 	lck_mtx_unlock(&ifp->if_ref_lock);
8066 	return TRUE;
8067 }
8068 
/*
 * Block until every in-flight data mover has left the datapath, then
 * flush the transmit queues.  Data movement must already have been
 * suspended (if_suspend > 0) before calling.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until ifnet_datamov_end() wakes us with if_datamov == 0 */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8096 
/*
 * Convenience wrapper: suspend data movement, then wait for all movers
 * to drain.  Balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8103 
/*
 * Balance a prior suspend: restore IFRF_READY when the last suspension
 * is lifted, then release the IO reference taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8117 
8118 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8119 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8120 {
8121 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8122 	ctrace_t *tr;
8123 	u_int32_t idx;
8124 	u_int16_t *cnt;
8125 
8126 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8127 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8128 		/* NOTREACHED */
8129 	}
8130 
8131 	if (refhold) {
8132 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8133 		tr = dl_if_dbg->dldbg_if_refhold;
8134 	} else {
8135 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8136 		tr = dl_if_dbg->dldbg_if_refrele;
8137 	}
8138 
8139 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8140 	ctrace_record(&tr[idx]);
8141 }
8142 
8143 errno_t
dlil_if_ref(struct ifnet * ifp)8144 dlil_if_ref(struct ifnet *ifp)
8145 {
8146 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8147 
8148 	if (dl_if == NULL) {
8149 		return EINVAL;
8150 	}
8151 
8152 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8153 	++dl_if->dl_if_refcnt;
8154 	if (dl_if->dl_if_refcnt == 0) {
8155 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8156 		/* NOTREACHED */
8157 	}
8158 	if (dl_if->dl_if_trace != NULL) {
8159 		(*dl_if->dl_if_trace)(dl_if, TRUE);
8160 	}
8161 	lck_mtx_unlock(&dl_if->dl_if_lock);
8162 
8163 	return 0;
8164 }
8165 
8166 errno_t
dlil_if_free(struct ifnet * ifp)8167 dlil_if_free(struct ifnet *ifp)
8168 {
8169 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8170 	bool need_release = FALSE;
8171 
8172 	if (dl_if == NULL) {
8173 		return EINVAL;
8174 	}
8175 
8176 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8177 	switch (dl_if->dl_if_refcnt) {
8178 	case 0:
8179 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8180 		/* NOTREACHED */
8181 		break;
8182 	case 1:
8183 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8184 			need_release = TRUE;
8185 		}
8186 		break;
8187 	default:
8188 		break;
8189 	}
8190 	--dl_if->dl_if_refcnt;
8191 	if (dl_if->dl_if_trace != NULL) {
8192 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8193 	}
8194 	lck_mtx_unlock(&dl_if->dl_if_lock);
8195 	if (need_release) {
8196 		_dlil_if_release(ifp, true);
8197 	}
8198 	return 0;
8199 }
8200 
/*
 * Attach a fully-initialized if_proto to its interface: let the family
 * module refine the demux descriptors, link the proto onto the
 * interface's protocol hash chain, and post KEV_DL_PROTO_ATTACHED.
 * On success a proto refcnt is held for the attachment.  Returns
 * EEXIST if the family is already attached, ENXIO if the interface is
 * no longer attached, EINVAL for non-bridge protocols on vmnet.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* hold an io ref across the attach */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so the proto is appended last */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8280 
/*
 * Post-attach processing shared by the v1/v2 attach KPIs: bring the
 * interface up and broadcast the updated interface flags.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8304 
/*
 * Public KPI: attach a v1 protocol to an interface.  Validates the
 * arguments, builds an if_proto from proto_details, and hands it to
 * dlil_attach_protocol(); on success the interface is marked up via
 * dlil_handle_proto_attach().  Returns EEXIST when the protocol family
 * is already attached, ENXIO when the interface is not on the global
 * list.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; reclaim the unused if_proto */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8366 
8367 errno_t
ifnet_attach_protocol_v2(ifnet_t ifp,protocol_family_t protocol,const struct ifnet_attach_proto_param_v2 * proto_details)8368 ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
8369     const struct ifnet_attach_proto_param_v2 *proto_details)
8370 {
8371 	int retval = 0;
8372 	struct if_proto  *ifproto = NULL;
8373 	uint32_t proto_count = 0;
8374 
8375 	ifnet_head_lock_shared();
8376 	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8377 		retval = EINVAL;
8378 		goto end;
8379 	}
8380 	/* Check that the interface is in the global list */
8381 	if (!ifnet_lookup(ifp)) {
8382 		retval = ENXIO;
8383 		goto end;
8384 	}
8385 
8386 	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8387 
8388 	/* refcnt held above during lookup */
8389 	ifproto->ifp = ifp;
8390 	ifproto->protocol_family = protocol;
8391 	ifproto->proto_kpi = kProtoKPI_v2;
8392 	ifproto->kpi.v2.input = proto_details->input;
8393 	ifproto->kpi.v2.pre_output = proto_details->pre_output;
8394 	ifproto->kpi.v2.event = proto_details->event;
8395 	ifproto->kpi.v2.ioctl = proto_details->ioctl;
8396 	ifproto->kpi.v2.detached = proto_details->detached;
8397 	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
8398 	ifproto->kpi.v2.send_arp = proto_details->send_arp;
8399 
8400 	retval = dlil_attach_protocol(ifproto,
8401 	    proto_details->demux_list, proto_details->demux_count,
8402 	    &proto_count);
8403 
8404 end:
8405 	if (retval == EEXIST) {
8406 		/* already attached */
8407 		if (dlil_verbose) {
8408 			DLIL_PRINTF("%s: protocol %d already attached\n",
8409 			    ifp != NULL ? if_name(ifp) : "N/A",
8410 			    protocol);
8411 		}
8412 	} else if (retval != 0) {
8413 		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
8414 		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8415 	} else if (dlil_verbose) {
8416 		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
8417 		    ifp != NULL ? if_name(ifp) : "N/A",
8418 		    protocol, proto_count);
8419 	}
8420 	ifnet_head_done();
8421 	if (retval == 0) {
8422 		dlil_handle_proto_attach(ifp, protocol);
8423 	} else if (ifproto != NULL) {
8424 		zfree(dlif_proto_zone, ifproto);
8425 	}
8426 	return retval;
8427 }
8428 
/*
 * Detach a previously attached protocol from an interface.
 *
 * The protocol is unhooked from the per-interface hash, its KPI
 * callback vector is replaced with inert media stubs (which all fail
 * with ENXIO or do nothing) so racing callers do not invoke stale
 * callbacks, and the two outstanding references — one taken at attach
 * time, one taken by the lookup below — are dropped.  The remaining
 * detach work runs when the last protocol reference is released.
 *
 * Returns 0 on success, EINVAL on bad arguments, or ENXIO if the
 * protocol family is not attached to this interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	/* Unlink from the per-interface protocol hash chain. */
	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Swap the callback vector for harmless stubs; any caller that
	 * still holds a reference gets ENXIO instead of a stale pointer.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8494 
/*
 * Inert v1 input stub installed by ifnet_detach_protocol(); rejects
 * every packet with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8502 
/*
 * Inert v2 input stub installed by ifnet_detach_protocol(); rejects
 * every packet with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8510 
/*
 * Inert pre-output stub installed by ifnet_detach_protocol(); always
 * fails with ENXIO (shared by v1 and v2 callback vectors).
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8519 
/*
 * Inert event stub installed by ifnet_detach_protocol(); silently
 * discards interface events for a detached protocol.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8526 
/*
 * Inert ioctl stub installed by ifnet_detach_protocol(); always fails
 * with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8534 
/*
 * Inert multicast-resolve stub installed by ifnet_detach_protocol();
 * always fails with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8542 
/*
 * Inert ARP stub installed by ifnet_detach_protocol(); always fails
 * with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8551 
8552 extern int if_next_index(void);
8553 extern int tcp_ecn_outbound;
8554 
8555 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8556 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8557 {
8558 	uint32_t sflags = 0;
8559 	int err;
8560 
8561 	if (if_flowadv) {
8562 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8563 	}
8564 
8565 	if (if_delaybased_queue) {
8566 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8567 	}
8568 
8569 	if (ifp->if_output_sched_model ==
8570 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8571 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8572 	}
8573 	/* Inherit drop limit from the default queue */
8574 	if (ifp->if_snd != ifcq) {
8575 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8576 	}
8577 	/* Initialize transmit queue(s) */
8578 	err = ifclassq_setup(ifcq, ifp, sflags);
8579 	if (err != 0) {
8580 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8581 		    "err=%d", __func__, ifp, err);
8582 		/* NOTREACHED */
8583 	}
8584 }
8585 
/*
 * Attach an ifnet to the system, making it visible for lookups and
 * ready to carry traffic.  This assigns an interface index (reusing a
 * free ifindex2ifnet slot when the index space is exhausted), installs
 * the link-layer address, clears/restores interface statistics, sets
 * up the transmit classq, spawns the per-interface DLIL input/start/
 * poll kernel threads as required, attaches IGMP/MLD state, and
 * finally marks the ifnet IFRF_ATTACHED | IFRF_READY.
 *
 * `ll_addr', when non-NULL, supplies the link-layer address; its
 * sdl_alen must match if_addrlen (or if_addrlen must still be zero).
 *
 * Returns 0 on success; EINVAL (NULL ifp or addrlen mismatch),
 * EEXIST (already on the interface list), ENODEV (no protocol family
 * module hooks), or ENOBUFS (no free index, or link-address
 * allocation failure).
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	/* The caller must have marked the ifnet embryonic before attach. */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* Adopt or validate the caller-supplied link-address length. */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Interface filter list must start out empty. */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* A recycled (DLIF_REUSE) ifnet keeps its multicast memberships. */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		if (idx == -1) {
			/* no index available at all */
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	ifa_addref(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Publish: visible via ifnet_head and ifindex2ifnet from here on. */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV: no dedicated input thread for this ifp */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* Give the starter thread a slight priority boost. */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* Same slight priority boost as the starter thread. */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* Count suspended link-layer memberships kept across a recycle. */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);
	VERIFY(ifp->if_delegated.ultra_constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9088 
9089 /*
9090  * Prepare the storage for the first/permanent link address, which must
9091  * must have the same lifetime as the ifnet itself.  Although the link
9092  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9093  * its location in memory must never change as it may still be referred
9094  * to by some parts of the system afterwards (unfortunate implementation
9095  * artifacts inherited from BSD.)
9096  *
9097  * Caller must hold ifnet lock as writer.
9098  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa = NULL;
	struct sockaddr_dl *addr_sdl, *mask_sdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute the sockaddr_dl sizes: the mask covers the header plus
	 * the interface name; the address additionally carries the
	 * link-layer address bytes, rounded up to 32-bit alignment.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;

		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* first time through: allocate the extended storage */
			dl_if_lladdr_ext = zalloc_permanent(
				sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));

			ifa = &dl_if_lladdr_ext->ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		} else {
			/* already using extended storage; recover the wrapper */
			dl_if_lladdr_ext = __unsafe_forge_single(
				struct dl_if_lladdr_xtra_space*, ifa);
			ifa = &dl_if_lladdr_ext->ifa;
		}

		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if_lladdr_ext->addr_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
		bzero(dl_if_lladdr_ext->mask_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
		addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
		mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if->dl_if_lladdr.addr_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
		bzero(dl_if->dl_if_lladdr.mask_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
		addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
		mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
	}

	/* Swap in the new ifa; the old one is released at the end. */
	if (ifp->if_lladdr != ifa) {
		oifa = ifp->if_lladdr;
		ifp->if_lladdr = ifa;
	}

	/* Populate the address sockaddr_dl: name, index, type, lladdr. */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = SA(addr_sdl);
	addr_sdl->sdl_len = (u_char)socksize;
	addr_sdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, addr_sdl->sdl_data, min(namelen,
		    sizeof(addr_sdl->sdl_data)));
		addr_sdl->sdl_nlen = (u_char)namelen;
	} else {
		addr_sdl->sdl_nlen = 0;
	}
	addr_sdl->sdl_index = ifp->if_index;
	addr_sdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		addr_sdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), addr_sdl->sdl_alen);
	} else {
		addr_sdl->sdl_alen = 0;
	}
	/* The netmask is all-ones over the interface-name bytes. */
	ifa->ifa_netmask = SA(mask_sdl);
	mask_sdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		mask_sdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* Drop the reference on the link address we replaced, if any. */
	if (oifa != NULL) {
		ifa_remref(oifa);
	}

	return ifa;
}
9220 
/*
 * Purge all IPv4 (when INET is configured) and IPv6 addresses from
 * the interface.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9229 
/*
 * Begin detaching an interface from the system.
 *
 * Marks the interface down, removes it from the global lists
 * (ifnet_head, ifindex2ifnet[]) so it is no longer visible to
 * lookups, clears per-interface state that may be reset before I/O
 * references drain, posts KEV_DL_IF_DETACHING, and finally enqueues
 * the ifnet for the detacher worker thread, which completes the
 * teardown in ifnet_detach_final().
 *
 * Returns 0 on success, EINVAL if ifp is NULL or not attached, or
 * ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Invalidate ND6 CGA state, if any, for this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* Flip IFRF_ATTACHED -> IFRF_DETACHING under if_ref_lock */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9424 
/*
 * Append an ifnet to the global detaching queue and wake the detacher
 * worker thread.  Caller must hold dlil_if_lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* nudge the detacher thread blocked on ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9435 
/*
 * Remove and return the first ifnet on the detaching queue, or NULL
 * if the queue is empty.  Caller must hold dlil_if_lock.  The removed
 * entry's linkage is cleared so ifnet_detach_final() can verify it.
 */
static struct ifnet *
ifnet_detaching_dequeue(void)
{
	struct ifnet *ifp;

	dlil_if_lock_assert();

	ifp = TAILQ_FIRST(&ifnet_detaching_head);
	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
	if (ifp != NULL) {
		VERIFY(ifnet_detaching_cnt != 0);
		--ifnet_detaching_cnt;
		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
		ifp->if_detaching_link.tqe_next = NULL;
		ifp->if_detaching_link.tqe_prev = NULL;
	}
	return ifp;
}
9454 
/*
 * Continuation routine for the detacher thread: drains the
 * ifnet_detaching_head queue, calling ifnet_detach_final() on each
 * dequeued ifnet with dlil_if_lock dropped, then blocks again on
 * ifnet_delayed_run with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; ifnet_detach_final() may block */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9497 
/*
 * Entry point for the detacher thread: primes the wait on
 * ifnet_delayed_run, issues one wakeup so the continuation can clear
 * the embryonic state, then transfers control permanently to
 * ifnet_detacher_thread_cont().  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9514 
/*
 * Complete the detach of an interface previously queued by
 * ifnet_detach().  Runs on the detacher worker thread.
 *
 * Waits for all outstanding I/O references to drain, then tears down
 * everything still attached to the ifnet: BPF, skywalk nexuses, send
 * queue, interface filters, addresses, protocols, the permanent link
 * address, starter/poller/input threads, cached routes, and QoS
 * state.  All driver callbacks are repointed at the local ifp_if_*
 * stubs in case the driver unloads.  Finally clears IFRF_DETACHING,
 * invokes the saved if_free callback, clears DLIF_INUSE so the
 * dlil_ifnet can be recycled by dlil_if_acquire(), and releases the
 * reference held since attach.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach.  This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(LIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/*
	 * Detach interface filters.  The head is snapshotted and the
	 * list reinitialized so filters can be detached with the lock
	 * dropped around each callback.
	 */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/*
	 * Unplumb all protocols.  The ifnet lock is dropped around
	 * each proto_unplumb() call, so re-read the hash head each
	 * iteration rather than walking a stale list.
	 */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	ifa_remref(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time.  Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;    /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop ifnet lock while blocking on the input thread */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);
	VERIFY(ifp->if_delegated.ultra_constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous/allmulti counts need to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_amcount = 0;
	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Ignore any pending data threshold as the interface is anyways gone */
	ifp->if_data_threshold = 0;

	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9902 
9903 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9904 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9905 {
9906 #pragma unused(ifp)
9907 	m_freem_list(m);
9908 	return 0;
9909 }
9910 
/*
 * Transmit-start handler installed on a detached ifnet (see
 * ifnet_detach_final); purges whatever is queued for the interface.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9916 
9917 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9918 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9919     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9920     boolean_t poll, struct thread *tp)
9921 {
9922 #pragma unused(ifp, m_tail, s, poll, tp)
9923 	m_freem_list(m_head);
9924 	return ENXIO;
9925 }
9926 
9927 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9928 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9929     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9930 {
9931 #pragma unused(ifp, flags, max_cnt)
9932 	if (m_head != NULL) {
9933 		*m_head = NULL;
9934 	}
9935 	if (m_tail != NULL) {
9936 		*m_tail = NULL;
9937 	}
9938 	if (cnt != NULL) {
9939 		*cnt = 0;
9940 	}
9941 	if (len != NULL) {
9942 		*len = 0;
9943 	}
9944 }
9945 
9946 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)9947 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9948 {
9949 #pragma unused(ifp, cmd, arglen, arg)
9950 	return EOPNOTSUPP;
9951 }
9952 
9953 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)9954 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9955 {
9956 #pragma unused(ifp, fh, pf)
9957 	m_freem(m);
9958 	return EJUSTRETURN;
9959 }
9960 
9961 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)9962 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9963     const struct ifnet_demux_desc *da, u_int32_t dc)
9964 {
9965 #pragma unused(ifp, pf, da, dc)
9966 	return EINVAL;
9967 }
9968 
9969 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)9970 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9971 {
9972 #pragma unused(ifp, pf)
9973 	return EINVAL;
9974 }
9975 
9976 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)9977 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
9978 {
9979 #pragma unused(ifp, sa)
9980 	return EOPNOTSUPP;
9981 }
9982 
/*
 * Legacy framer installed on a detached ifnet (see ifnet_detach_final);
 * forwards to ifp_if_framer_extended(), which drops the packet.  The
 * signature carries pre/post arguments only on non-macOS targets.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10001 
10002 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10003 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10004     const struct sockaddr *sa, const char *ll, const char *t,
10005     u_int32_t *pre, u_int32_t *post)
10006 {
10007 #pragma unused(ifp, sa, ll, t)
10008 	m_freem(*m);
10009 	*m = NULL;
10010 
10011 	if (pre != NULL) {
10012 		*pre = 0;
10013 	}
10014 	if (post != NULL) {
10015 		*post = 0;
10016 	}
10017 
10018 	return EJUSTRETURN;
10019 }
10020 
10021 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10022 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10023 {
10024 #pragma unused(ifp, cmd, arg)
10025 	return EOPNOTSUPP;
10026 }
10027 
10028 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10029 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10030 {
10031 #pragma unused(ifp, tm, f)
10032 	/* XXX not sure what to do here */
10033 	return 0;
10034 }
10035 
/*
 * No-op free callback installed on a detached ifnet (see
 * ifnet_detach_final); the driver's original if_free was saved and
 * invoked there, in case the driver subsequently unloads.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10041 
/*
 * No-op event callback installed on a detached ifnet (see
 * ifnet_detach_final); kernel events are silently ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10047 
/*
 * Acquire a dlil_ifnet for the given family/uniqueid/name, either by
 * recycling a previously detached one (matching family and uniqueid,
 * not DLIF_INUSE) or by allocating and initializing a fresh one.
 *
 * The entire dlil_ifnet_head list is traversed under dlil_if_lock:
 * returns EBUSY if an in-use interface of the same family already has
 * the same extended name or the same uniqueid, ENOMEM if the uniqueid
 * copy cannot be allocated, 0 otherwise with *ifp set (and referenced
 * via dlil_if_ref).
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an in-use interface with the same
	 * name.  To make sure of that, the list has to be traversed
	 * completely.
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	LIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10226 
/*
 * Common teardown for an ifnet backed by a struct dlil_ifnet: balance the
 * allocation counters bumped at allocation time, free the broadcast
 * address storage, and point the interface's name/xname back at the
 * embedded storage buffers.  When clear_in_use is true, the DLIF_INUSE
 * marker is also cleared so the dlil_ifnet can be recycled.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	/* Undo the counts taken when this ifnet was allocated */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	/* ifnet lock first, then the dlil_ifnet private lock */
	ifnet_lock_exclusive(ifp);
	kfree_data_counted_by(ifp->if_broadcast.ptr, ifp->if_broadcast.length);
	lck_mtx_lock(&dlifp->dl_if_lock);
	/* Preserve the current name in the embedded storage, then use it */
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10253 
/*
 * Release an ifnet's dlil state without clearing the DLIF_INUSE marker;
 * see _dlil_if_release() for the actual teardown.
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10259 
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10265 
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10271 
/* Assert that the current thread owns the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10277 
10278 __private_extern__ void
dlil_proto_unplumb_all(struct ifnet * ifp)10279 dlil_proto_unplumb_all(struct ifnet *ifp)
10280 {
10281 	/*
10282 	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
10283 	 * each bucket contains exactly one entry; PF_VLAN does not need an
10284 	 * explicit unplumb.
10285 	 *
10286 	 * if_proto_hash[3] is for other protocols; we expect anything
10287 	 * in this bucket to respond to the DETACHING event (which would
10288 	 * have happened by now) and do the unplumb then.
10289 	 */
10290 	(void) proto_unplumb(PF_INET, ifp);
10291 	(void) proto_unplumb(PF_INET6, ifp);
10292 }
10293 
/*
 * Copy the interface's cached IPv4 forwarding route into *dst.  The
 * spin-then-convert acquisition keeps the initial grab cheap while
 * letting route_copyout() run with the full mutex held.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10304 
/*
 * Install *src as the interface's cached IPv4 forwarding route.  If
 * route caching is disabled on this interface (if_fwd_cacheok == 0), the
 * caller's route reference is released instead of being cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10318 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the interface's
 * cached IPv6 forwarding route into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10330 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): install *src as the cached
 * IPv6 forwarding route, or release the caller's reference when route
 * caching is disabled on this interface.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10345 
/*
 * Return a route for src_ip scoped to ifp, using the per-ifnet one-entry
 * route cache.  The cached route is reused when it is still usable and
 * its destination matches src_ip; otherwise a fresh scoped lookup is
 * performed and the result is stored back into the cache.  The caller
 * gets its own reference on the returned rtentry (NULL if the lookup
 * fails).
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = SIN(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		/* Cache miss or stale route: drop it and look up anew */
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (Re)initialize the destination sockaddr */
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped(SA(dst),
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10380 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): return a route for
 * *src_ip6 scoped to ifp, using the per-ifnet cached IPv6 route.  The
 * caller gets its own reference on the returned rtentry (NULL if the
 * lookup fails).
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		/* Cache miss or stale route: drop it and look up anew */
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): ro_rt is NULL at this point after
		 * ROUTE_RELEASE(), so this guard always takes the lookup
		 * path; the IPv4 variant VERIFYs the same invariant and
		 * looks up unconditionally.  Confirm before restructuring.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				SA(&src_rt.ro_dst), 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10417 
/*
 * Update the interface's link quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event if it changed.  The raw lqm
 * value is first normalized to one of the threshold edges.
 *
 * Locking: 'locked' indicates whether the caller already holds the ifnet
 * lock exclusively.  Either way the lock is dropped around the kevent
 * post; it is reacquired before returning only when the caller held it
 * (locked != 0), so the caller's lock state is preserved.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* Kick the TCP fast timer to react to the abort-level LQM */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10481 
/*
 * Update the interface's RRC (radio resource control) state and post a
 * KEV_DL_RRC_STATE_CHANGED event if it changed.
 *
 * Locking: called with the ifnet lock held exclusively (see
 * if_state_update()); the lock is dropped around the kevent post and
 * reacquired before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		/* Unchanged and already valid; nothing to do */
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10511 
10512 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10513 if_state_update(struct ifnet *ifp,
10514     struct if_interface_state *if_interface_state)
10515 {
10516 	u_short if_index_available = 0;
10517 
10518 	ifnet_lock_exclusive(ifp);
10519 
10520 	if ((ifp->if_type != IFT_CELLULAR) &&
10521 	    (if_interface_state->valid_bitmask &
10522 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10523 		ifnet_lock_done(ifp);
10524 		return ENOTSUP;
10525 	}
10526 	if ((if_interface_state->valid_bitmask &
10527 	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10528 	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10529 	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10530 		ifnet_lock_done(ifp);
10531 		return EINVAL;
10532 	}
10533 	if ((if_interface_state->valid_bitmask &
10534 	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10535 	    if_interface_state->rrc_state !=
10536 	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10537 	    if_interface_state->rrc_state !=
10538 	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10539 		ifnet_lock_done(ifp);
10540 		return EINVAL;
10541 	}
10542 
10543 	if (if_interface_state->valid_bitmask &
10544 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10545 		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10546 	}
10547 	if (if_interface_state->valid_bitmask &
10548 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10549 		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10550 	}
10551 	if (if_interface_state->valid_bitmask &
10552 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10553 		ifp->if_interface_state.valid_bitmask |=
10554 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10555 		ifp->if_interface_state.interface_availability =
10556 		    if_interface_state->interface_availability;
10557 
10558 		if (ifp->if_interface_state.interface_availability ==
10559 		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10560 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10561 			    __func__, if_name(ifp), ifp->if_index);
10562 			if_index_available = ifp->if_index;
10563 		} else {
10564 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10565 			    __func__, if_name(ifp), ifp->if_index);
10566 		}
10567 	}
10568 	ifnet_lock_done(ifp);
10569 
10570 	/*
10571 	 * Check if the TCP connections going on this interface should be
10572 	 * forced to send probe packets instead of waiting for TCP timers
10573 	 * to fire. This is done on an explicit notification such as
10574 	 * SIOCSIFINTERFACESTATE which marks the interface as available.
10575 	 */
10576 	if (if_index_available > 0) {
10577 		tcp_interface_send_probe(if_index_available);
10578 	}
10579 
10580 	return 0;
10581 }
10582 
10583 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10584 if_get_state(struct ifnet *ifp,
10585     struct if_interface_state *if_interface_state)
10586 {
10587 	ifnet_lock_shared(ifp);
10588 
10589 	if_interface_state->valid_bitmask = 0;
10590 
10591 	if (ifp->if_interface_state.valid_bitmask &
10592 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10593 		if_interface_state->valid_bitmask |=
10594 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10595 		if_interface_state->rrc_state =
10596 		    ifp->if_interface_state.rrc_state;
10597 	}
10598 	if (ifp->if_interface_state.valid_bitmask &
10599 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10600 		if_interface_state->valid_bitmask |=
10601 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10602 		if_interface_state->lqm_state =
10603 		    ifp->if_interface_state.lqm_state;
10604 	}
10605 	if (ifp->if_interface_state.valid_bitmask &
10606 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10607 		if_interface_state->valid_bitmask |=
10608 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10609 		if_interface_state->interface_availability =
10610 		    ifp->if_interface_state.interface_availability;
10611 	}
10612 
10613 	ifnet_lock_done(ifp);
10614 }
10615 
10616 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10617 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10618 {
10619 	if (conn_probe > 1) {
10620 		return EINVAL;
10621 	}
10622 	if (conn_probe == 0) {
10623 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10624 	} else {
10625 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10626 	}
10627 
10628 #if NECP
10629 	necp_update_all_clients();
10630 #endif /* NECP */
10631 
10632 	tcp_probe_connectivity(ifp, conn_probe);
10633 	return 0;
10634 }
10635 
10636 /* for uuid.c */
/*
 * Scan the interface list for the best "ethernet" interface to derive a
 * UUID node address from: en0 if present, otherwise the lowest-unit en*
 * interface, otherwise any IFT_ETHER interface.
 *
 * Returns en0's interface index (0 if absent); when en0 is absent the
 * fallback index (or 0) is stored in *ret_other_index.
 *
 * Caller holds the ifnet head lock (see uuid_get_ethernet()); each ifnet
 * is inspected under its own shared lock.
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		/* No en0: report the best fallback, if any */
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
10678 
/*
 * Fill node[] with an ethernet address suitable for UUID generation,
 * preferring en0 and falling back per get_ether_index().  The permanent
 * (factory) ethernet address is used when it was recorded, since it
 * never changes.  The discovered en0 index is cached in a static across
 * calls and revalidated against ifindex2ifnet.
 *
 * Returns 0 on success, -1 if no suitable ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* (Re)discover en0 if the cached index is unset or stale */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10720 
10721 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10722 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10723     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10724 {
10725 	struct kev_dl_node_presence kev;
10726 	struct sockaddr_dl *sdl;
10727 	struct sockaddr_in6 *sin6;
10728 	int ret = 0;
10729 
10730 	VERIFY(ifp);
10731 	VERIFY(sa);
10732 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10733 
10734 	bzero(&kev, sizeof(kev));
10735 	sin6 = &kev.sin6_node_address;
10736 	sdl = &kev.sdl_node_address;
10737 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10738 	kev.rssi = rssi;
10739 	kev.link_quality_metric = lqm;
10740 	kev.node_proximity_metric = npm;
10741 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10742 
10743 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10744 	if (ret == 0 || ret == EEXIST) {
10745 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10746 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10747 		if (err != 0) {
10748 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10749 			    "error %d\n", __func__, err);
10750 		}
10751 	}
10752 
10753 	if (ret == EEXIST) {
10754 		ret = 0;
10755 	}
10756 	return ret;
10757 }
10758 
/*
 * Record that a node is no longer present on ifp.  sa may be AF_INET6
 * (the link-layer address is then recovered from the neighbor cache) or
 * AF_LINK (the IPv6 address is derived from it).  On successful removal
 * from the neighbor cache a KEV_DL_NODE_ABSENCE event is posted.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp the event with this interface's type and index */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
10799 
/*
 * Variant of dlil_node_present() taking the IPv6 address (sa) and the
 * link-layer address (sdl) separately instead of decomposing a single
 * sockaddr.  Updates the neighbor cache and posts KEV_DL_NODE_PRESENCE;
 * EEXIST from the cache is mapped to 0 (the event is then posted with
 * the suppress-duplicate flag set).
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* Stamp the event copy with this interface's type and index */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
10843 
10844 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10845 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10846     kauth_cred_t *credp)
10847 {
10848 	const u_int8_t *bytes;
10849 	size_t size;
10850 
10851 	bytes = CONST_LLADDR(sdl);
10852 	size = sdl->sdl_alen;
10853 
10854 #if CONFIG_MACF
10855 	if (dlil_lladdr_ckreq) {
10856 		switch (sdl->sdl_type) {
10857 		case IFT_ETHER:
10858 		case IFT_IEEE1394:
10859 			break;
10860 		default:
10861 			credp = NULL;
10862 			break;
10863 		}
10864 		;
10865 
10866 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10867 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10868 				[0] = 2
10869 			};
10870 
10871 			bytes = unspec;
10872 		}
10873 	}
10874 #else
10875 #pragma unused(credp)
10876 #endif
10877 
10878 	if (sizep != NULL) {
10879 		*sizep = size;
10880 	}
10881 	return bytes;
10882 }
10883 
/*
 * Post a KEV_DL_ISSUES kernel event for ifp carrying the reporting
 * module's identifier (modid, DLIL_MODIDLEN bytes), an optional argument
 * blob (info, DLIL_MODARGLEN bytes) and a seconds-resolution timestamp.
 */
void
dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
    u_int8_t info[DLIL_MODARGLEN])
{
	struct kev_dl_issues kev;
	struct timeval tv;

	VERIFY(ifp != NULL);
	VERIFY(modid != NULL);
	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);

	bzero(&kev, sizeof(kev));

	microtime(&tv);
	kev.timestamp = tv.tv_sec;
	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
	if (info != NULL) {
		bcopy(info, &kev.info, DLIL_MODARGLEN);
	}

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
	    &kev.link_data, sizeof(kev), FALSE);
}
10908 
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC.
 *
 * Set: requires superuser; maps ifo_flags to a throttle level and applies
 * it via ifnet_set_throttle().  Get: reports the current throttle level
 * as ifo_flags.  In both directions (on success) ifo_inuse is filled with
 * the count of opportunistic TCP/UDP connections on the interface.
 * EALREADY from the throttle update is reported as success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10967 
/*
 * Query the interface's output throttling level into *level.  Returns
 * ENXIO if the interface does not use the new transmit model
 * (IFEF_TXSTART); otherwise 0, with *level defaulting to
 * IFNET_THROTTLE_OFF when the send queue is not enabled.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* First field 0 == query (see ifnet_set_throttle) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
10993 
/*
 * Set the interface's output throttling level (IFNET_THROTTLE_OFF or
 * IFNET_THROTTLE_OPPORTUNISTIC).  Returns ENXIO if the interface does
 * not use the new transmit model (IFEF_TXSTART), EINVAL for any other
 * level.  On success NECP clients are notified, and turning throttling
 * off restarts the transmit thread.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* First field 1 == set (see ifnet_get_throttle) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		if (level == IFNET_THROTTLE_OFF) {
			/* Kick the transmit path back into motion */
			ifnet_start(ifp);
		}
	}

	return err;
}
11035 
/*
 * Handle SIOCSIFLOG / SIOCGIFLOG.
 *
 * Set: requires PRIV_NET_INTERFACE_CONTROL; validates level and flags
 * (note the checks accumulate into 'result' rather than returning early,
 * so all fields are read before deciding) and applies them via
 * ifnet_set_log().  Get: reports the current settings from
 * ifnet_get_log().
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* Restrict to known facility bits; empty set is invalid */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11083 
/*
 * Apply a logging level/flags/category/subcategory to the interface.
 * The level applies to all facilities; flags select which facilities the
 * request targets.  If the driver registered an output control callback,
 * the non-DLIL facilities are forwarded to it; otherwise they are
 * silently dropped and only the DLIL facility (if requested) is kept.
 * On success the interface's if_log state is updated and the change is
 * logged.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* The DLIL facility is ours; don't forward it downward */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* Level IFNET_LOG_DEFAULT resets all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
		    "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags, flags,
		    category, subcategory);
	}

	return err;
}
11141 
11142 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11143 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11144     int32_t *category, int32_t *subcategory)
11145 {
11146 	if (level != NULL) {
11147 		*level = ifp->if_log.level;
11148 	}
11149 	if (flags != NULL) {
11150 		*flags = ifp->if_log.flags;
11151 	}
11152 	if (category != NULL) {
11153 		*category = ifp->if_log.category;
11154 	}
11155 	if (subcategory != NULL) {
11156 		*subcategory = ifp->if_log.subcategory;
11157 	}
11158 
11159 	return 0;
11160 }
11161 
/*
 * Notify interested parties that an address of family 'af' changed on
 * the interface: run the PF hook (if compiled in) and forward the
 * notification to the driver's output control callback.  Returns
 * EOPNOTSUPP when no callback is registered, otherwise the callback's
 * result.
 */
int
ifnet_notify_address(struct ifnet *ifp, int af)
{
	struct ifnet_notify_address_params na;

#if PF
	(void) pf_ifaddr_hook(ifp);
#endif /* PF */

	if (ifp->if_output_ctl == NULL) {
		return EOPNOTSUPP;
	}

	bzero(&na, sizeof(na));
	na.address_family = (sa_family_t)af;

	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
	           sizeof(na), &na);
}
11181 
11182 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11183 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11184 {
11185 	if (ifp == NULL || flowid == NULL) {
11186 		return EINVAL;
11187 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11188 	    !IF_FULLY_ATTACHED(ifp)) {
11189 		return ENXIO;
11190 	}
11191 
11192 	*flowid = ifp->if_flowhash;
11193 
11194 	return 0;
11195 }
11196 
11197 errno_t
ifnet_disable_output(struct ifnet * ifp)11198 ifnet_disable_output(struct ifnet *ifp)
11199 {
11200 	int err = 0;
11201 
11202 	if (ifp == NULL) {
11203 		return EINVAL;
11204 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11205 	    !IF_FULLY_ATTACHED(ifp)) {
11206 		return ENXIO;
11207 	}
11208 
11209 	lck_mtx_lock(&ifp->if_start_lock);
11210 	if (ifp->if_start_flags & IFSF_FLOW_RESUME_PENDING) {
11211 		ifp->if_start_flags &= ~(IFSF_FLOW_RESUME_PENDING | IFSF_FLOW_CONTROLLED);
11212 	} else if ((err = ifnet_fc_add(ifp)) == 0) {
11213 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11214 	}
11215 	lck_mtx_unlock(&ifp->if_start_lock);
11216 
11217 	return err;
11218 }
11219 
11220 errno_t
ifnet_enable_output(struct ifnet * ifp)11221 ifnet_enable_output(struct ifnet *ifp)
11222 {
11223 	if (ifp == NULL) {
11224 		return EINVAL;
11225 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11226 	    !IF_FULLY_ATTACHED(ifp)) {
11227 		return ENXIO;
11228 	}
11229 
11230 	ifnet_start_common(ifp, TRUE, FALSE);
11231 	return 0;
11232 }
11233 
/*
 * Flow-advisory resume: look up (and detach) the flow-control entry
 * registered for this flow hash and, if the owning interface is still
 * attached and still owns that hash, re-enable its output.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* ifnet_fc_get() removes the entry from the tree; we must free it. */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			/*
			 * If output is not currently flow-controlled, mark a
			 * resume as pending so ifnet_disable_output() can
			 * detect this race and back out.
			 */
			if ((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) == 0) {
				ifp->if_start_flags |= IFSF_FLOW_RESUME_PENDING;
			}
			lck_mtx_unlock(&ifp->if_start_lock);
			(void) ifnet_enable_output(ifp);
		}
		/* drop the io refcnt taken by ifnet_is_attached() above */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11262 
11263 /*
11264  * Function to compare ifnet_fc_entries in ifnet flow control tree
11265  */
11266 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11267 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11268 {
11269 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11270 }
11271 
/*
 * Register ifp in the global flow-control tree, keyed by its flow hash.
 * Returns 0 on success (or if the entry already exists), EAGAIN when a
 * different interface already occupies the same flow hash (collision).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Build a stack-local key for the red-black tree lookup. */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; the Z_WAITOK allocation below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11315 
/*
 * Look up and remove the flow-control entry for the given flow hash.
 * Returns the detached entry (caller is responsible for freeing it via
 * ifnet_fc_entry_free()), or NULL when no entry exists or the owning
 * interface is no longer attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Stack-local key for the red-black tree lookup. */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Always detach the entry; it is single-shot. */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11353 
/* Return a flow-control entry to its zone allocator. */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11359 
11360 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11361 ifnet_calc_flowhash(struct ifnet *ifp)
11362 {
11363 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11364 	uint32_t flowhash = 0;
11365 
11366 	if (ifnet_flowhash_seed == 0) {
11367 		ifnet_flowhash_seed = RandomULong();
11368 	}
11369 
11370 	bzero(&fh, sizeof(fh));
11371 
11372 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11373 	fh.ifk_unit = ifp->if_unit;
11374 	fh.ifk_flags = ifp->if_flags;
11375 	fh.ifk_eflags = ifp->if_eflags;
11376 	fh.ifk_capabilities = ifp->if_capabilities;
11377 	fh.ifk_capenable = ifp->if_capenable;
11378 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11379 	fh.ifk_rand1 = RandomULong();
11380 	fh.ifk_rand2 = RandomULong();
11381 
11382 try_again:
11383 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11384 	if (flowhash == 0) {
11385 		/* try to get a non-zero flowhash */
11386 		ifnet_flowhash_seed = RandomULong();
11387 		goto try_again;
11388 	}
11389 
11390 	return flowhash;
11391 }
11392 
/*
 * Install (or, with len == 0, clear) the network signature for the
 * given address family on the interface.  Returns EINVAL for an
 * unsupported family or oversized signature, ENOMEM when the per-AF
 * extra storage has not been allocated.  The flags argument is
 * currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* note: early paths unlock before the break */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11454 
/*
 * Copy out the network signature for the given address family.
 * On input *len is the capacity of the caller's data buffer; on
 * success it is updated to the actual signature length.  Returns
 * EINVAL for a bad family or undersized buffer, ENOENT when no
 * signature is set, ENOMEM when per-AF storage is missing.  *flags
 * (optional) is always reported as 0 on success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* reject a zero-capacity or too-small buffer */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11515 
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the
 * interface.  A zero prefix_len clears the corresponding slot; any
 * other length must be one of the standard NAT64 prefix lengths and
 * must not be scope-embedded, otherwise EINVAL is returned and later
 * slots are left unprocessed.  If at least one prefix was set, NECP
 * clients are notified after the lock is dropped.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* notify NECP only after releasing the lock */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11581 
11582 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11583 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11584 {
11585 	int i, found_one = 0, error = 0;
11586 
11587 	if (ifp == NULL) {
11588 		return EINVAL;
11589 	}
11590 
11591 	if_inet6data_lock_shared(ifp);
11592 
11593 	if (IN6_IFEXTRA(ifp) == NULL) {
11594 		error = ENOMEM;
11595 		goto out;
11596 	}
11597 
11598 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11599 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11600 			found_one = 1;
11601 		}
11602 	}
11603 
11604 	if (found_one == 0) {
11605 		error = ENOENT;
11606 		goto out;
11607 	}
11608 
11609 	if (prefixes) {
11610 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11611 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11612 	}
11613 
11614 out:
11615 	if_inet6data_lock_done(ifp);
11616 
11617 	return error;
11618 }
11619 
11620 __attribute__((noinline))
11621 static void
dlil_output_cksum_dbg(struct ifnet * ifp,struct mbuf * m,uint32_t hoff,protocol_family_t pf)11622 dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
11623     protocol_family_t pf)
11624 {
11625 #pragma unused(ifp)
11626 	uint32_t did_sw;
11627 
11628 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
11629 	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
11630 		return;
11631 	}
11632 
11633 	switch (pf) {
11634 	case PF_INET:
11635 		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
11636 		if (did_sw & CSUM_DELAY_IP) {
11637 			hwcksum_dbg_finalized_hdr++;
11638 		}
11639 		if (did_sw & CSUM_DELAY_DATA) {
11640 			hwcksum_dbg_finalized_data++;
11641 		}
11642 		break;
11643 	case PF_INET6:
11644 		/*
11645 		 * Checksum offload should not have been enabled when
11646 		 * extension headers exist; that also means that we
11647 		 * cannot force-finalize packets with extension headers.
11648 		 * Indicate to the callee should it skip such case by
11649 		 * setting optlen to -1.
11650 		 */
11651 		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
11652 		    m->m_pkthdr.csum_flags);
11653 		if (did_sw & CSUM_DELAY_IPV6_DATA) {
11654 			hwcksum_dbg_finalized_data++;
11655 		}
11656 		break;
11657 	default:
11658 		return;
11659 	}
11660 }
11661 
/*
 * Receive-path checksum debugging.  Depending on hwcksum_dbg_mode this
 * can (a) force partial checksum offload on an inbound packet by
 * computing the sum in software, and/or (b) verify a driver-supplied
 * partial checksum against a software-computed one and optionally
 * re-base it to a different start offset to emulate various hardware.
 * Only PF_INET/PF_INET6 packets are considered; the counters touched
 * here are debug statistics.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer against the mbuf bounds. */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* link-layer header length: gap between frame header and payload */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* discard whatever RX checksum state the driver reported */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* rebase offset to be relative to m_data */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11786 
11787 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification.  Starts with the bytes 0x1f 0x8b, which
 * look like a gzip magic number -- presumably compressed filler; the
 * content itself is irrelevant, only the byte values matter to the
 * checksum self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
11824 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* TRUE once sumr has been computed at runtime */
	uint16_t        len;    /* span length (bytes) over sumdata */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11849 
/*
 * Boot-time self-test for the 16-bit 1's complement sum routines
 * (m_sum16, b_sum16) against the reference in_cksum_mbuf_ref() and the
 * precomputed values in sumtbl, exercised at every byte alignment
 * within a uint64_t.  Panics on any mismatch.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (uintptr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Lazily compute the runtime reference on first use */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (uintptr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
11938 #endif /* DEBUG || DEVELOPMENT */
11939 
/* Expands to a switch case that returns the stringified code. */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel-event code to its symbolic name; returns ""
 * for unknown codes.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
11976 
11977 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)11978 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
11979 {
11980 #pragma unused(arg1)
11981 	struct ifnet *ifp = arg0;
11982 
11983 	if (ifnet_is_attached(ifp, 1)) {
11984 		nstat_ifnet_threshold_reached(ifp->if_index);
11985 		ifnet_decr_iorefcnt(ifp);
11986 	}
11987 }
11988 
/*
 * Check whether the interface's combined rx/tx byte count advanced by
 * more than if_data_threshold since the last notification, and if so
 * schedule the data-threshold thread call (rate-limited to one firing
 * per threshold_interval).
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    /* CAS ensures only one concurrent caller advances the marker */
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* defer to the next periodic boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* no interval configured; fire immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12018 
12019 
/*
 * Forward per-flow interface statistics to TCP, which owns the
 * aggregation logic.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12026 
/* Atomically OR set_flags into *flags_p; returns the pre-update value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12032 
/* Atomically clear the clear_flags bits in *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12038 
/* Atomically set extended-flag bits; returns the previous if_eflags. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12044 
/* Atomically clear extended-flag bits on the interface. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12050 
/* Atomically set extra-flag (if_xflags) bits; returns the previous value. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12056 
/* Atomically clear extra-flag (if_xflags) bits on the interface. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12062 
/* Bump the traffic-rule generation counter (relaxed atomic increment). */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12068 
12069 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12070 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12071 {
12072 	if (*genid != ifp->if_traffic_rule_genid) {
12073 		*genid = ifp->if_traffic_rule_genid;
12074 		return TRUE;
12075 	}
12076 	return FALSE;
12077 }
/*
 * Publish a new traffic-rule count (release ordering) and then bump
 * the generation id so observers re-synchronize via
 * ifnet_sync_traffic_rule_genid().
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12084 
12085 static void
log_hexdump(void * data,size_t len)12086 log_hexdump(void *data, size_t len)
12087 {
12088 	size_t i, j, k;
12089 	unsigned char *ptr = (unsigned char *)data;
12090 #define MAX_DUMP_BUF 32
12091 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12092 
12093 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12094 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12095 			unsigned char msnbl = ptr[j] >> 4;
12096 			unsigned char lsnbl = ptr[j] & 0x0f;
12097 
12098 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12099 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12100 
12101 			if ((j % 2) == 1) {
12102 				buf[k++] = ' ';
12103 			}
12104 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12105 				buf[k++] = ' ';
12106 			}
12107 		}
12108 		buf[k] = 0;
12109 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12110 	}
12111 }
12112 
12113 #if SKYWALK
12114 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12115 net_check_compatible_if_filter(struct ifnet *ifp)
12116 {
12117 	if (ifp == NULL) {
12118 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12119 			return false;
12120 		}
12121 	} else {
12122 		if (ifp->if_flt_non_os_count > 0) {
12123 			return false;
12124 		}
12125 	}
12126 	return true;
12127 }
12128 #endif /* SKYWALK */
12129 
/*
 * Advance the dump cursor by the k bytes just written, bailing out to
 * the "done" label when the remaining capacity is exhausted.  Relies on
 * locals c (cursor), clen (remaining space) and k in the caller.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}

int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: write a summary of the interfaces with the deepest
 * send queue (ifcq) and input queue into str, returning the number of
 * bytes written.  Scans ifindex2ifnet without synchronization.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	/*
	 * NOTE(review): the bound "ifidx < if_index" skips the interface
	 * at index if_index itself -- confirm whether "<=" was intended.
	 */
	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12178