xref: /xnu-8019.80.24/bsd/net/dlil.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 1999-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include <stddef.h>
35 #include <ptrauth.h>
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62 
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69 
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
146 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151 
152 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
153 #define MAX_LINKADDR        4 /* LONGWORDS */
154 #define M_NKE M_IFADDR
155 
156 #if 1
157 #define DLIL_PRINTF     printf
158 #else
159 #define DLIL_PRINTF     kprintf
160 #endif
161 
162 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
163 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
164 
165 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
166 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
167 
/*
 * Version of the protocol KPI used by an attached protocol; selects
 * which member of struct if_proto's kpi union (v1 or v2) is valid.
 */
168 enum {
169 	kProtoKPI_v1    = 1,    /* callbacks use proto_media_input */
170 	kProtoKPI_v2    = 2     /* callbacks use proto_media_input_v2 */
171 };
172 
173 /*
174  * List of if_proto structures in if_proto_hash[] is protected by
175  * the ifnet lock.  The rest of the fields are initialized at protocol
176  * attach time and never change, thus no lock required as long as
177  * a reference to it is valid, via if_proto_ref().
178  */
179 struct if_proto {
180 	SLIST_ENTRY(if_proto)       next_hash;  /* link in if_proto_hash[] chain (ifnet lock, see above) */
181 	u_int32_t                   refcount;   /* references; taken via if_proto_ref() */
182 	u_int32_t                   detached;   /* detach state flag */
183 	struct ifnet                *ifp;       /* interface this protocol is attached to */
184 	protocol_family_t           protocol_family; /* attached protocol family */
185 	int                         proto_kpi;  /* kProtoKPI_v1 or kProtoKPI_v2; selects union arm */
186 	union {
187 		struct {                        /* valid when proto_kpi == kProtoKPI_v1 */
188 			proto_media_input               input;
189 			proto_media_preout              pre_output;
190 			proto_media_event               event;
191 			proto_media_ioctl               ioctl;
192 			proto_media_detached            detached;
193 			proto_media_resolve_multi       resolve_multi;
194 			proto_media_send_arp            send_arp;
195 		} v1;
196 		struct {                        /* valid when proto_kpi == kProtoKPI_v2 */
197 			proto_media_input_v2            input;
198 			proto_media_preout              pre_output;
199 			proto_media_event               event;
200 			proto_media_ioctl               ioctl;
201 			proto_media_detached            detached;
202 			proto_media_resolve_multi       resolve_multi;
203 			proto_media_send_arp            send_arp;
204 		} v2;
205 	} kpi;
206 };
207 
208 SLIST_HEAD(proto_hash_entry, if_proto);
209 
210 #define DLIL_SDLDATALEN \
211 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
212 
/*
 * DLIL's private wrapper around an ifnet: embeds the public struct ifnet
 * followed by DLIL-internal bookkeeping (flags, refcount, name/address
 * storage, input-thread state, attach/detach traces).  dl_if_flags takes
 * the DLIF_* values defined below.
 */
213 struct dlil_ifnet {
214 	struct ifnet    dl_if;                  /* public ifnet */
215 	/*
216 	 * DLIL private fields, protected by dl_if_lock
217 	 */
218 	decl_lck_mtx_data(, dl_if_lock);
219 	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
220 	u_int32_t dl_if_flags;                  /* flags (below) */
221 	u_int32_t dl_if_refcnt;                 /* refcnt */
222 	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
223 	void    *dl_if_uniqueid;                /* unique interface id */
224 	size_t  dl_if_uniqueid_len;             /* length of the unique id */
225 	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
226 	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
227 	struct {
228 		struct ifaddr   ifa;            /* lladdr ifa */
229 		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
230 		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
231 	} dl_if_lladdr;
232 	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
233 	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
234 	u_int8_t dl_if_permanent_ether_is_set;  /* dl_if_permanent_ether is valid */
235 	u_int8_t dl_if_unused;                  /* unused (explicit padding byte) */
236 	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
237 	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
238 	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
239 };
240 
241 /* Values for dl_if_flags (private to DLIL) */
242 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
243 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
244 #define DLIF_DEBUG      0x4     /* has debugging info */
245 
246 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
247 
248 /* For gdb */
249 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
250 
/*
 * Debug variant of struct dlil_ifnet: the plain dlil_ifnet followed by
 * reference hold/release counters and circular caller-trace histories
 * (IF_REF_TRACE_HIST_SIZE entries each).
 */
251 struct dlil_ifnet_dbg {
252 	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
253 	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
254 	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
255 	/*
256 	 * Circular lists of ifnet_{reference,release} callers.
257 	 */
258 	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
259 	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
260 };
261 
262 #define DLIL_TO_IFP(s)  (&s->dl_if)
263 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
264 
/*
 * An interface filter: a set of client callbacks attached to one ifnet.
 * Allocated from dlif_filt_zone (below).
 */
265 struct ifnet_filter {
266 	TAILQ_ENTRY(ifnet_filter)       filt_next;  /* list linkage */
267 	u_int32_t                       filt_skip;  /* skip flag; NOTE(review): exact semantics set elsewhere */
268 	u_int32_t                       filt_flags; /* filter flags */
269 	ifnet_t                         filt_ifp;   /* interface the filter is attached to */
270 	const char                      *filt_name; /* filter name */
271 	void                            *filt_cookie; /* opaque client context */
272 	protocol_family_t               filt_protocol; /* protocol family of interest */
273 	iff_input_func                  filt_input; /* inbound packet callback */
274 	iff_output_func                 filt_output; /* outbound packet callback */
275 	iff_event_func                  filt_event; /* interface event callback */
276 	iff_ioctl_func                  filt_ioctl; /* ioctl callback */
277 	iff_detached_func               filt_detached; /* detach-complete callback */
278 };
279 
280 struct proto_input_entry;
281 
282 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
283 
284 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
285 
286 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
287 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
288 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
289 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
290 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
291 
292 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
293 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
294     &dlil_lck_attributes);
295 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
296     &dlil_lck_attributes);
297 
298 #if DEBUG
299 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
300 #else
301 static unsigned int ifnet_debug;        /* debugging (disabled) */
302 #endif /* !DEBUG */
303 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
304 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
305 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
306 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
307 
308 static ZONE_DECLARE(dlif_filt_zone, "ifnet_filter",
309     sizeof(struct ifnet_filter), ZC_ZFREE_CLEARMEM);
310 
311 static ZONE_DECLARE(dlif_phash_zone, "ifnet_proto_hash",
312     sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, ZC_ZFREE_CLEARMEM);
313 
314 static ZONE_DECLARE(dlif_proto_zone, "ifnet_proto",
315     sizeof(struct if_proto), ZC_ZFREE_CLEARMEM);
316 
317 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
318 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
319 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
320 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
321 
322 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
323 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
324 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
325 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
326 
327 static u_int32_t net_rtref;
328 
329 static struct dlil_main_threading_info dlil_main_input_thread_info;
330 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
331     (struct dlil_threading_info *)&dlil_main_input_thread_info;
332 
333 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
334 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
335 static void dlil_if_trace(struct dlil_ifnet *, int);
336 static void if_proto_ref(struct if_proto *);
337 static void if_proto_free(struct if_proto *);
338 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
339 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
340     u_int32_t list_count);
341 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
342 static void if_flt_monitor_busy(struct ifnet *);
343 static void if_flt_monitor_unbusy(struct ifnet *);
344 static void if_flt_monitor_enter(struct ifnet *);
345 static void if_flt_monitor_leave(struct ifnet *);
346 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
347     char **, protocol_family_t);
348 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
349     protocol_family_t);
350 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
351     const struct sockaddr_dl *);
352 static int ifnet_lookup(struct ifnet *);
353 static void if_purgeaddrs(struct ifnet *);
354 
355 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
356     struct mbuf *, char *);
357 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
358     struct mbuf *);
359 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
360     mbuf_t *, const struct sockaddr *, void *, char *, char *);
361 static void ifproto_media_event(struct ifnet *, protocol_family_t,
362     const struct kev_msg *);
363 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
364     unsigned long, void *);
365 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
366     struct sockaddr_dl *, size_t);
367 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
368     const struct sockaddr_dl *, const struct sockaddr *,
369     const struct sockaddr_dl *, const struct sockaddr *);
370 
371 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
372     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
373     boolean_t poll, struct thread *tp);
374 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
375     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
376 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
377 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
378     protocol_family_t *);
379 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
380     const struct ifnet_demux_desc *, u_int32_t);
381 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
382 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
383 #if !XNU_TARGET_OS_OSX
384 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
385     const struct sockaddr *, const char *, const char *,
386     u_int32_t *, u_int32_t *);
387 #else /* XNU_TARGET_OS_OSX */
388 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
389     const struct sockaddr *, const char *, const char *);
390 #endif /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
392     const struct sockaddr *, const char *, const char *,
393     u_int32_t *, u_int32_t *);
394 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
395 static void ifp_if_free(struct ifnet *);
396 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
397 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
398 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
399 
400 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
401     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
402     boolean_t, struct thread *);
403 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
404     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405     boolean_t, struct thread *);
406 
407 static void dlil_main_input_thread_func(void *, wait_result_t);
408 static void dlil_main_input_thread_cont(void *, wait_result_t);
409 
410 static void dlil_input_thread_func(void *, wait_result_t);
411 static void dlil_input_thread_cont(void *, wait_result_t);
412 
413 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
414 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
415 
416 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
417     thread_continue_t *);
418 static void dlil_terminate_input_thread(struct dlil_threading_info *);
419 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
420     struct dlil_threading_info *, struct ifnet *, boolean_t);
421 static boolean_t dlil_input_stats_sync(struct ifnet *,
422     struct dlil_threading_info *);
423 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
424     u_int32_t, ifnet_model_t, boolean_t);
425 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
426     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
427 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
428 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
429 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
430 #if DEBUG || DEVELOPMENT
431 static void dlil_verify_sum16(void);
432 #endif /* DEBUG || DEVELOPMENT */
433 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
434     protocol_family_t);
435 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
436     protocol_family_t);
437 
438 static void dlil_incr_pending_thread_count(void);
439 static void dlil_decr_pending_thread_count(void);
440 
441 static void ifnet_detacher_thread_func(void *, wait_result_t);
442 static void ifnet_detacher_thread_cont(void *, wait_result_t);
443 static void ifnet_detach_final(struct ifnet *);
444 static void ifnet_detaching_enqueue(struct ifnet *);
445 static struct ifnet *ifnet_detaching_dequeue(void);
446 
447 static void ifnet_start_thread_func(void *, wait_result_t);
448 static void ifnet_start_thread_cont(void *, wait_result_t);
449 
450 static void ifnet_poll_thread_func(void *, wait_result_t);
451 static void ifnet_poll_thread_cont(void *, wait_result_t);
452 
453 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
454     classq_pkt_t *, boolean_t, boolean_t *);
455 
456 static void ifp_src_route_copyout(struct ifnet *, struct route *);
457 static void ifp_src_route_copyin(struct ifnet *, struct route *);
458 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
459 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
460 
461 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
462 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
463 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
464 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
465 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
467 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
468 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
469 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
470 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
471 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
472 
473 struct chain_len_stats tx_chain_len_stats;
474 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
475 
476 #if TEST_INPUT_THREAD_TERMINATION
477 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
478 #endif /* TEST_INPUT_THREAD_TERMINATION */
479 
480 /* The following are protected by dlil_ifnet_lock */
481 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
482 static u_int32_t ifnet_detaching_cnt;
483 static boolean_t ifnet_detaching_embryonic;
484 static void *ifnet_delayed_run; /* wait channel for detaching thread */
485 
486 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
487     &dlil_lck_attributes);
488 
489 static uint32_t ifnet_flowhash_seed;
490 
/*
 * Key material hashed by ifnet_calc_flowhash() to derive an interface's
 * flow hash (used as the lookup key for ifnet_fc_entry below).
 * ifk_rand1/ifk_rand2 appear to be per-interface random salt — confirm
 * against ifnet_calc_flowhash()'s definition.
 */
491 struct ifnet_flowhash_key {
492 	char            ifk_name[IFNAMSIZ];
493 	uint32_t        ifk_unit;
494 	uint32_t        ifk_flags;
495 	uint32_t        ifk_eflags;
496 	uint32_t        ifk_capabilities;
497 	uint32_t        ifk_capenable;
498 	uint32_t        ifk_output_sched_model;
499 	uint32_t        ifk_rand1;
500 	uint32_t        ifk_rand2;
501 };
502 
503 /* Flow control entry per interface */
504 struct ifnet_fc_entry {
505 	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* node in ifnet_fc_tree (ifnet_fc_lock) */
506 	u_int32_t       ifce_flowhash;          /* interface flow hash; lookup key for ifnet_fc_get() */
507 	struct ifnet    *ifce_ifp;              /* back-pointer to the interface */
508 };
509 
510 static uint32_t ifnet_calc_flowhash(struct ifnet *);
511 static int ifce_cmp(const struct ifnet_fc_entry *,
512     const struct ifnet_fc_entry *);
513 static int ifnet_fc_add(struct ifnet *);
514 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
515 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
516 
517 /* protected by ifnet_fc_lock */
518 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
519 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
521 
522 static ZONE_DECLARE(ifnet_fc_zone, "ifnet_fc_zone",
523     sizeof(struct ifnet_fc_entry), ZC_ZFREE_CLEARMEM);
524 
525 extern void bpfdetach(struct ifnet *);
526 extern void proto_input_run(void);
527 
528 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
529     u_int32_t flags);
530 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
531     u_int32_t flags);
532 
533 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
534 
535 #if CONFIG_MACF
536 #if !XNU_TARGET_OS_OSX
537 int dlil_lladdr_ckreq = 1;
538 #else /* XNU_TARGET_OS_OSX */
539 int dlil_lladdr_ckreq = 0;
540 #endif /* XNU_TARGET_OS_OSX */
541 #endif /* CONFIG_MACF */
542 
543 #if DEBUG
544 int dlil_verbose = 1;
545 #else
546 int dlil_verbose = 0;
547 #endif /* DEBUG */
548 #if IFNET_INPUT_SANITY_CHK
549 /* sanity checking of input packet lists received */
550 static u_int32_t dlil_input_sanity_check = 0;
551 #endif /* IFNET_INPUT_SANITY_CHK */
552 /* rate limit debug messages */
553 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
554 
555 SYSCTL_DECL(_net_link_generic_system);
556 
557 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
558     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
559 
560 #define IF_SNDQ_MINLEN  32
561 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
562 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
563     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
564     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
565 
566 #define IF_RCVQ_MINLEN  32
567 #define IF_RCVQ_MAXLEN  256
568 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
569 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
570     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
571     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
572 
573 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
574 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
575 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
576     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
577     "ilog2 of EWMA decay rate of avg inbound packets");
578 
579 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
580 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
581 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
582 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
583     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
584     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
585     "Q", "input poll mode freeze time");
586 
587 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
588 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
589 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
590 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
591     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
592     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
593     "Q", "input poll sampling time");
594 
595 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
596 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
597     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
598     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
599     "Q", "input poll interval (time)");
600 
601 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
602 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
603 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
604     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
605     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
606 
607 #define IF_RXPOLL_WLOWAT        10
608 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
609 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
610     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
611     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
612     "I", "input poll wakeup low watermark");
613 
614 #define IF_RXPOLL_WHIWAT        100
615 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
616 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
617     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
618     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
619     "I", "input poll wakeup high watermark");
620 
621 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
622 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
623     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
624     "max packets per poll call");
625 
626 u_int32_t if_rxpoll = 1;
627 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
628     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
629     sysctl_rxpoll, "I", "enable opportunistic input polling");
630 
631 #if TEST_INPUT_THREAD_TERMINATION
632 static u_int32_t if_input_thread_termination_spin = 0;
633 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
634     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
635     &if_input_thread_termination_spin, 0,
636     sysctl_input_thread_termination_spin,
637     "I", "input thread termination spin limit");
638 #endif /* TEST_INPUT_THREAD_TERMINATION */
639 
640 static u_int32_t cur_dlil_input_threads = 0;
641 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
642     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
643     "Current number of DLIL input threads");
644 
645 #if IFNET_INPUT_SANITY_CHK
646 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
647     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
648     "Turn on sanity checking in DLIL input");
649 #endif /* IFNET_INPUT_SANITY_CHK */
650 
651 static u_int32_t if_flowadv = 1;
652 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
653     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
654     "enable flow-advisory mechanism");
655 
656 static u_int32_t if_delaybased_queue = 1;
657 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
658     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
659     "enable delay based dynamic queue sizing");
660 
661 static uint64_t hwcksum_in_invalidated = 0;
662 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
663     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
664     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
665 
666 uint32_t hwcksum_dbg = 0;
667 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
668     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
669     "enable hardware cksum debugging");
670 
671 u_int32_t ifnet_start_delayed = 0;
672 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
673     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
674     "number of times start was delayed");
675 
676 u_int32_t ifnet_delay_start_disabled = 0;
677 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
678     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
679     "number of times start was delayed");
680 
/*
 * Atomically bump ifnet_delay_start_disabled, the counter exported via
 * the net.link.generic.system.start_delay_disabled sysctl above.
 * (The xref extraction had fused the tool's anchor text into line 682,
 * duplicating the declarator; restored to a single declarator line.)
 */
681 static inline void
682 ifnet_delay_start_disabled_increment(void)
683 {
684 	OSIncrementAtomic(&ifnet_delay_start_disabled);
685 }
686 
687 #define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

/*
 * Hardware checksum debugging knobs and counters, exported under
 * net.link.generic.system.*.  hwcksum_dbg_mode takes a bitmask of the
 * HWCKSUM_DBG_* flags above (validated by sysctl_hwcksum_dbg_mode).
 */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* Global on/off switches for TX/RX hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* DLIL data threshold notification controls */
static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

/* Feature toggles: input polling, thread affinity, async work */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
804 
805 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)806 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
807 {
808 	/*
809 	 * update filter count and route_generation ID to let TCP
810 	 * know it should reevalute doing TSO or not
811 	 */
812 	if (filter_enable) {
813 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
814 	} else {
815 		VERIFY(ifp->if_flt_no_tso_count != 0);
816 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
817 	}
818 	routegenid_update();
819 }
820 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* derived defaults: whether the fsw IP/transport netagents start enabled */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

/* whether netif is plumbed on all interfaces (see ifnet_needs_compat) */
unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
839 
#if (DEVELOPMENT || DEBUG)
/*
 * sysctl handler for net.link.generic.system.if_attach_nx (DEV/DEBUG
 * builds only).  Allows the nexus auto-attach flags to be changed at
 * runtime, except that the flowswitch transport netagent bit may not be
 * toggled this way (ENOTSUP).
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error) {
		return error;
	}
	if (changed) {
		/* the transport netagent bit cannot be changed at runtime */
		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
			return ENOTSUP;
		}
		if_attach_nx = new_value;
	}
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
867 
868 static int
869 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
870 {
871 #pragma unused(oidp, arg1, arg2)
872 	unsigned int new_value;
873 	int changed;
874 	int error;
875 
876 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
877 	    sizeof(if_enable_fsw_transport_netagent),
878 	    &new_value, &changed);
879 	if (error == 0 && changed != 0) {
880 		if (new_value != 0 && new_value != 1) {
881 			/* only allow 0 or 1 */
882 			error = EINVAL;
883 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
884 			/* netagent can be enabled/disabled */
885 			if_enable_fsw_transport_netagent = new_value;
886 			if (new_value == 0) {
887 				kern_nexus_deregister_netagents();
888 			} else {
889 				kern_nexus_register_netagents();
890 			}
891 		} else {
892 			/* netagent can't be enabled */
893 			error = ENOTSUP;
894 		}
895 	}
896 	return error;
897 }
898 
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

/* forward declaration; defined after dlil_attach_flowswitch_nexus below */
static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
907 
908 boolean_t
ifnet_nx_noauto(ifnet_t ifp)909 ifnet_nx_noauto(ifnet_t ifp)
910 {
911 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
912 }
913 
914 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)915 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
916 {
917 	return ifnet_is_low_latency(ifp);
918 }
919 
920 boolean_t
ifnet_is_low_latency(ifnet_t ifp)921 ifnet_is_low_latency(ifnet_t ifp)
922 {
923 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
924 }
925 
/*
 * Returns TRUE when the netif compat layer should be plumbed for ifp.
 * Gated globally by the IF_ATTACH_NX_NETIF_COMPAT flag; on non-macOS
 * targets, a Wi-Fi Access Point interface (name "ap") only gets the
 * compat layer when if_netif_all is set.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
950 
951 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)952 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
953 {
954 	if (if_is_fsw_transport_netagent_enabled()) {
955 		/* check if netagent has been manually enabled for ipsec/utun */
956 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
957 			return ipsec_interface_needs_netagent(ifp);
958 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
959 			return utun_interface_needs_netagent(ifp);
960 		}
961 
962 		/* check ifnet no auto nexus override */
963 		if (ifnet_nx_noauto(ifp)) {
964 			return FALSE;
965 		}
966 
967 		/* check global if_attach_nx configuration */
968 		switch (ifp->if_family) {
969 		case IFNET_FAMILY_CELLULAR:
970 		case IFNET_FAMILY_ETHERNET:
971 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
972 				return TRUE;
973 			}
974 			break;
975 		default:
976 			break;
977 		}
978 	}
979 	return FALSE;
980 }
981 
982 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)983 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
984 {
985 #pragma unused(ifp)
986 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
987 		return TRUE;
988 	}
989 	return FALSE;
990 }
991 
992 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)993 ifnet_needs_netif_netagent(ifnet_t ifp)
994 {
995 #pragma unused(ifp)
996 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
997 }
998 
/*
 * Tear down a nexus provider instance: detach the device port first (if
 * one was attached), then free the provider instance itself.  Returns
 * TRUE if an instance was present and teardown was attempted, FALSE if
 * the instance UUID is NULL/nil.  Failures in individual steps are
 * logged (tagged with func_str) but do not stop the remaining cleanup.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t         err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1025 
1026 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1027 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1028     uuid_t device)
1029 {
1030 	boolean_t               detached = FALSE;
1031 	nexus_controller_t      controller = kern_nexus_shared_controller();
1032 	int                     err;
1033 
1034 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1035 	    device)) {
1036 		detached = TRUE;
1037 	}
1038 	if (provider != NULL && !uuid_is_null(provider)) {
1039 		detached = TRUE;
1040 		err = kern_nexus_controller_deregister_provider(controller,
1041 		    provider);
1042 		if (err != 0) {
1043 			DLIL_PRINTF("%s deregister_provider %d\n",
1044 			    func_str, err);
1045 		}
1046 	}
1047 	return detached;
1048 }
1049 
/*
 * Register a nexus provider named "com.apple.<type>.<ifname>" under the
 * default domain provider for `type' (netif or flowswitch) and allocate
 * one instance of it.  On success, *provider and *instance are filled
 * in and 0 is returned; on failure any partially-created provider is
 * deregistered and the error is returned.  Note the success path also
 * falls through the `failed:' label, returning err == 0.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration before bailing out */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	return err;
}
1099 
/*
 * Create and attach a netif nexus (provider + instance) to ifp, filling
 * in netif_nx on success.  Returns TRUE on success, FALSE if a nexus is
 * already attached (IFCAP_SKYWALK set) or any step fails; partial state
 * is torn down on failure.  The temporary nexus attr is always freed.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1153 
1154 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1155 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1156 {
1157 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1158 	    IFNET_IS_VMNET(ifp)) {
1159 		goto failed;
1160 	}
1161 	switch (ifp->if_type) {
1162 	case IFT_CELLULAR:
1163 	case IFT_ETHER:
1164 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1165 			/* don't auto-attach */
1166 			goto failed;
1167 		}
1168 		break;
1169 	default:
1170 		/* don't auto-attach */
1171 		goto failed;
1172 	}
1173 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1174 
1175 failed:
1176 	return FALSE;
1177 }
1178 
1179 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1180 dlil_is_native_netif_nexus(ifnet_t ifp)
1181 {
1182 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1183 }
1184 
/* Tear down the netif nexus recorded in nexus_netif (thin wrapper). */
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1191 
1192 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1193 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1194 {
1195 	struct ifreq        ifr;
1196 	int                 error;
1197 
1198 	bzero(&ifr, sizeof(ifr));
1199 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1200 	if (error == 0) {
1201 		*ifdm_p = ifr.ifr_devmtu;
1202 	}
1203 	return error;
1204 }
1205 
1206 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint64_t * buf_size,bool * use_multi_buflet)1207 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint64_t *buf_size,
1208     bool *use_multi_buflet)
1209 {
1210 	struct kern_pbufpool_memory_info rx_pp_info;
1211 	struct kern_pbufpool_memory_info tx_pp_info;
1212 	uint32_t if_max_mtu = 0;
1213 	uint32_t drv_buf_size;
1214 	struct ifdevmtu ifdm;
1215 	int err;
1216 
1217 	/*
1218 	 * To perform intra-stack RX aggregation flowswitch needs to use
1219 	 * multi-buflet packet.
1220 	 */
1221 	*use_multi_buflet = (sk_fsw_rx_agg_tcp != 0);
1222 
1223 	/*
1224 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1225 	 * but the driver advertises the MAX MTU as only 9K.
1226 	 */
1227 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1228 		if_max_mtu = IP_MAXPACKET;
1229 		goto skip_mtu_ioctl;
1230 	}
1231 
1232 	/* determine max mtu */
1233 	bzero(&ifdm, sizeof(ifdm));
1234 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1235 	if (__improbable(err != 0)) {
1236 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1237 		    __func__, if_name(ifp));
1238 		/* use default flowswitch buffer size */
1239 		if_max_mtu = NX_FSW_BUFSIZE;
1240 	} else {
1241 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1242 		    ifdm.ifdm_max, ifdm.ifdm_current);
1243 		/* rdar://problem/44589731 */
1244 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1245 	}
1246 
1247 skip_mtu_ioctl:
1248 	if (if_max_mtu == 0) {
1249 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1250 		    __func__, if_name(ifp));
1251 		return EINVAL;
1252 	}
1253 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1254 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1255 		    "max bufsize(%d)\n", __func__,
1256 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1257 		return EINVAL;
1258 	}
1259 
1260 	/*
1261 	 * for skywalk native driver, consult the driver packet pool also.
1262 	 */
1263 	if (dlil_is_native_netif_nexus(ifp)) {
1264 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1265 		    &tx_pp_info);
1266 		if (err != 0) {
1267 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1268 			    __func__, if_name(ifp));
1269 			return ENXIO;
1270 		}
1271 		drv_buf_size = tx_pp_info.kpm_bufsize *
1272 		    tx_pp_info.kpm_max_frags;
1273 		if (if_max_mtu > drv_buf_size) {
1274 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1275 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1276 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1277 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1278 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1279 			return EINVAL;
1280 		}
1281 	} else {
1282 		drv_buf_size = if_max_mtu;
1283 	}
1284 
1285 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1286 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1287 		*use_multi_buflet = true;
1288 		/* default flowswitch buffer size */
1289 		*buf_size = NX_FSW_BUFSIZE;
1290 	} else {
1291 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1292 	}
1293 	return 0;
1294 }
1295 
/*
 * Create and attach a flowswitch nexus to ifp, filling in nexus_fsw on
 * success.  The interface must have a netif plumbed (IFCAP_SKYWALK),
 * must not have opted out of auto-attach, and the global
 * IF_ATTACH_NX_FLOWSWITCH flag must be set.  Buffer size and
 * multi-buflet configuration come from _dlil_get_flowswitch_buffer_size.
 * Returns TRUE on success; on failure all partial state is torn down
 * and the reason is logged.  The temporary nexus attr is always freed.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint64_t                buf_size = 0;
	bool                    multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1389 
/*
 * Attach a flowswitch nexus to ifp and publish it in
 * ifp->if_nx_flowswitch under the ifnet lock.  Skips interfaces using
 * the legacy TX model, interfaces that already have a flowswitch, and
 * (DEV/DEBUG) interfaces configured for direct netif use.  If the
 * interface loses its fully-attached state between the attach and the
 * publish, the freshly created nexus is torn down again.  Returns TRUE
 * when the flowswitch was attached and recorded.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1435 
/* Tear down the flowswitch nexus recorded in nexus_fsw (thin wrapper). */
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1442 
1443 boolean_t
ifnet_add_netagent(ifnet_t ifp)1444 ifnet_add_netagent(ifnet_t ifp)
1445 {
1446 	int     error;
1447 
1448 	error = kern_nexus_interface_add_netagent(ifp);
1449 	os_log(OS_LOG_DEFAULT,
1450 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1451 	    ifp->if_xname, error);
1452 	return error == 0;
1453 }
1454 
1455 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1456 ifnet_remove_netagent(ifnet_t ifp)
1457 {
1458 	int     error;
1459 
1460 	error = kern_nexus_interface_remove_netagent(ifp);
1461 	os_log(OS_LOG_DEFAULT,
1462 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1463 	    ifp->if_xname, error);
1464 	return error == 0;
1465 }
1466 
1467 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1468 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1469 {
1470 	if (!IF_FULLY_ATTACHED(ifp)) {
1471 		return FALSE;
1472 	}
1473 	return dlil_attach_flowswitch_nexus(ifp);
1474 }
1475 
/*
 * Snapshot and clear ifp->if_nx_flowswitch under the ifnet lock, then
 * tear the flowswitch nexus down outside the lock.  Returns TRUE if
 * anything was actually detached.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch     nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1488 
/*
 * Public entry point: attach a netif nexus to a fully attached
 * interface and record it in ifp->if_nx_netif under the ifnet lock.
 * Returns TRUE on success.
 */
boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)
{
	boolean_t       nexus_attached;
	if_nexus_netif  nexus_netif;

	if (!IF_FULLY_ATTACHED(ifp)) {
		return FALSE;
	}
	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
	if (nexus_attached) {
		ifnet_lock_exclusive(ifp);
		ifp->if_nx_netif = nexus_netif;
		ifnet_lock_done(ifp);
	}
	return nexus_attached;
}
1506 
/*
 * Snapshot and clear ifp->if_nx_netif under the ifnet lock, then tear
 * the netif nexus down outside the lock.  Returns TRUE if anything was
 * actually detached.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif  nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1520 
#endif /* SKYWALK */

/*
 * Sanity-check an inbound mbuf: it must carry a pkthdr and its recorded
 * receive interface must match ifp (loopback excepted); panics otherwise.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}

/*
 * Exponentially-weighted moving average update; `decay' is the shift
 * (i.e. the weight is 1 - 1/2^decay on the old value).
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)

/*
 * Per-downlink-speed thresholds used for input polling; the table below
 * is scanned by link speed (terminated by the all-zero sentinel entry).
 */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* protects dlil_pending_thread_cnt below */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1564 
/*
 * Bump the count of DLIL threads still starting up; paired with
 * dlil_decr_pending_thread_count(), which wakes waiters when the count
 * drains to zero.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1573 
/*
 * Drop the pending-thread count and wake anyone sleeping on it once it
 * reaches zero.  The count must be non-zero on entry (VERIFY).
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1586 
1587 int
proto_hash_value(u_int32_t protocol_family)1588 proto_hash_value(u_int32_t protocol_family)
1589 {
1590 	/*
1591 	 * dlil_proto_unplumb_all() depends on the mapping between
1592 	 * the hash bucket index and the protocol family defined
1593 	 * here; future changes must be applied there as well.
1594 	 */
1595 	switch (protocol_family) {
1596 	case PF_INET:
1597 		return 0;
1598 	case PF_INET6:
1599 		return 1;
1600 	case PF_VLAN:
1601 		return 2;
1602 	case PF_802154:
1603 		return 3;
1604 	case PF_UNSPEC:
1605 	default:
1606 		return 4;
1607 	}
1608 }
1609 
1610 /*
1611  * Caller must already be holding ifnet lock.
1612  */
1613 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1614 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1615 {
1616 	struct if_proto *proto = NULL;
1617 	u_int32_t i = proto_hash_value(protocol_family);
1618 
1619 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1620 
1621 	if (ifp->if_proto_hash != NULL) {
1622 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1623 	}
1624 
1625 	while (proto != NULL && proto->protocol_family != protocol_family) {
1626 		proto = SLIST_NEXT(proto, next_hash);
1627 	}
1628 
1629 	if (proto != NULL) {
1630 		if_proto_ref(proto);
1631 	}
1632 
1633 	return proto;
1634 }
1635 
/* Take a reference on an if_proto entry (released by if_proto_free). */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}

extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1643 
/*
 * Release a reference on an if_proto entry.  When the last reference is
 * dropped: invoke the protocol's detached callback (v1 or v2 KPI),
 * purge its routes, post KEV_DL_PROTO_DETACHED, and — if no protocols
 * remain attached — mark the interface down before freeing the entry.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	/* drop one reference; bail unless this was the last one */
	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data));

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1705 
1706 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1707 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1708 {
1709 #if !MACH_ASSERT
1710 #pragma unused(ifp)
1711 #endif
1712 	unsigned int type = 0;
1713 	int ass = 1;
1714 
1715 	switch (what) {
1716 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1717 		type = LCK_RW_ASSERT_EXCLUSIVE;
1718 		break;
1719 
1720 	case IFNET_LCK_ASSERT_SHARED:
1721 		type = LCK_RW_ASSERT_SHARED;
1722 		break;
1723 
1724 	case IFNET_LCK_ASSERT_OWNED:
1725 		type = LCK_RW_ASSERT_HELD;
1726 		break;
1727 
1728 	case IFNET_LCK_ASSERT_NOTOWNED:
1729 		/* nothing to do here for RW lock; bypass assert */
1730 		ass = 0;
1731 		break;
1732 
1733 	default:
1734 		panic("bad ifnet assert type: %d", what);
1735 		/* NOTREACHED */
1736 	}
1737 	if (ass) {
1738 		LCK_RW_ASSERT(&ifp->if_lock, type);
1739 	}
1740 }
1741 
/* Acquire the per-ifnet RW lock for shared (read) access. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
1747 
/* Acquire the per-ifnet RW lock for exclusive (write) access. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1753 
/* Release the per-ifnet RW lock, held in either shared or exclusive mode. */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1759 
1760 #if INET
/* Acquire the per-ifnet IPv4 data RW lock for shared access. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
1766 
/* Acquire the per-ifnet IPv4 data RW lock for exclusive access. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
1772 
/* Release the per-ifnet IPv4 data RW lock (either mode). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
1778 #endif
1779 
/* Acquire the per-ifnet IPv6 data RW lock for shared access. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
1785 
/* Acquire the per-ifnet IPv6 data RW lock for exclusive access. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
1791 
/* Release the per-ifnet IPv6 data RW lock (either mode). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1797 
/* Acquire the global interface-list RW lock for shared access. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}
1803 
/* Acquire the global interface-list RW lock for exclusive access. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
1809 
/* Release the global interface-list RW lock (either mode). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}
1815 
/* Assert that the global interface-list RW lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1821 
1822 /*
1823  * dlil_ifp_protolist
1824  * - get the list of protocols attached to the interface, or just the number
1825  *   of attached protocols
1826  * - if the number returned is greater than 'list_count', truncation occurred
1827  *
1828  * Note:
1829  * - caller must already be holding ifnet lock.
1830  */
1831 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1832 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1833     u_int32_t list_count)
1834 {
1835 	u_int32_t       count = 0;
1836 	int             i;
1837 
1838 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1839 
1840 	if (ifp->if_proto_hash == NULL) {
1841 		goto done;
1842 	}
1843 
1844 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1845 		struct if_proto *proto;
1846 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1847 			if (list != NULL && count < list_count) {
1848 				list[count] = proto->protocol_family;
1849 			}
1850 			count++;
1851 		}
1852 	}
1853 done:
1854 	return count;
1855 }
1856 
/*
 * Copy up to "count" attached protocol families into "protolist" and
 * return the total number attached (which may exceed "count" if the
 * supplied array was too small).  Takes the ifnet lock shared around
 * dlil_ifp_protolist().
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1865 
/* Free a protocol list buffer obtained for if_get_protolist(). */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1871 
1872 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len)1873 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1874     u_int32_t event_code, struct net_event_data *event_data,
1875     u_int32_t event_data_len)
1876 {
1877 	struct net_event_data ev_data;
1878 	struct kev_msg ev_msg;
1879 
1880 	bzero(&ev_msg, sizeof(ev_msg));
1881 	bzero(&ev_data, sizeof(ev_data));
1882 	/*
1883 	 * a net event always starts with a net_event_data structure
1884 	 * but the caller can generate a simple net event or
1885 	 * provide a longer event structure to post
1886 	 */
1887 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1888 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1889 	ev_msg.kev_subclass     = event_subclass;
1890 	ev_msg.event_code       = event_code;
1891 
1892 	if (event_data == NULL) {
1893 		event_data = &ev_data;
1894 		event_data_len = sizeof(struct net_event_data);
1895 	}
1896 
1897 	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1898 	event_data->if_family = ifp->if_family;
1899 	event_data->if_unit   = (u_int32_t)ifp->if_unit;
1900 
1901 	ev_msg.dv[0].data_length = event_data_len;
1902 	ev_msg.dv[0].data_ptr    = event_data;
1903 	ev_msg.dv[1].data_length = 0;
1904 
1905 	bool update_generation = true;
1906 	if (event_subclass == KEV_DL_SUBCLASS) {
1907 		/* Don't update interface generation for frequent link quality and state changes  */
1908 		switch (event_code) {
1909 		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
1910 		case KEV_DL_RRC_STATE_CHANGED:
1911 		case KEV_DL_NODE_PRESENCE:
1912 		case KEV_DL_NODE_ABSENCE:
1913 		case KEV_DL_PRIMARY_ELECTED:
1914 			update_generation = false;
1915 			break;
1916 		default:
1917 			break;
1918 		}
1919 	}
1920 
1921 	return dlil_event_internal(ifp, &ev_msg, update_generation);
1922 }
1923 
/*
 * Allocate the per-interface TCP/UDP stats buffers (64-bit aligned
 * within their zone-allocated backing store) and the IPv4/IPv6 ECN
 * stats structures for "ifp".
 *
 * Returns 0 on success; EINVAL if "ifp" is NULL or if the TCP/UDP
 * stats were already allocated.
 *
 * NOTE(review): if if_tcp_stat/if_udp_stat are already non-NULL, "ret"
 * stays EINVAL and the cleanup path below frees the existing buffers;
 * presumably this function is only ever called once per ifp during
 * attach -- confirm with callers before reusing it elsewhere.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* error path: undo whatever was (or had previously been) allocated */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2009 
/*
 * Reset an interface's opportunistic polling state to defaults: clear
 * the poll cycle, mode, flags and request counter, and zero out the
 * polling statistics and the mode/sample hold and last-update timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2028 
/*
 * Set up (and usually start) the input thread described by "inp".
 *
 * The continuation function is selected from the interface's
 * capabilities: the main DLIL input thread (ifp == NULL), the legacy
 * hybrid rx-poll thread, the generic asynchronous thread, or none at
 * all (synchronous strategy for non-legacy/netif interfaces), in which
 * case ENODEV is returned and no kernel thread is created.  The chosen
 * continuation is reported through "thfunc" when non-NULL.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling requires rx-poll support and the legacy model */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no dedicated thread is ever created */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2163 
2164 #if TEST_INPUT_THREAD_TERMINATION
2165 static int
2166 sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
2167 {
2168 #pragma unused(arg1, arg2)
2169 	uint32_t i;
2170 	int err;
2171 
2172 	i = if_input_thread_termination_spin;
2173 
2174 	err = sysctl_handle_int(oidp, &i, 0, req);
2175 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
2176 		return err;
2177 	}
2178 
2179 	if (net_rxpoll == 0) {
2180 		return ENXIO;
2181 	}
2182 
2183 	if_input_thread_termination_spin = i;
2184 	return err;
2185 }
2186 #endif /* TEST_INPUT_THREAD_TERMINATION */
2187 
/*
 * Tear down a dlil_threading_info after its input thread has
 * terminated: destroy the lock and lock group and reset all fields so
 * the structure can be reused.  Asserts that no packets, affinity
 * state, or auxiliary (driver/poller) threads remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2213 
/*
 * Terminate the calling input thread (never the main one): drain any
 * queued packets, signal DLIL_INPUT_TERMINATE_COMPLETE to the waiter,
 * drop the reference taken by kernel_thread_start(), and self-destruct
 * via thread_terminate().  Does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* take any remaining packets off the queue before signalling */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2261 
2262 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2263 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2264 {
2265 	thread_affinity_policy_data_t policy;
2266 
2267 	bzero(&policy, sizeof(policy));
2268 	policy.affinity_tag = tag;
2269 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2270 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2271 }
2272 
2273 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
2274 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2275 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2276     enum net_filter_event_subsystems state)
2277 {
2278 	if (state == 0) {
2279 		if_enable_fsw_transport_netagent = 1;
2280 	} else {
2281 		if_enable_fsw_transport_netagent = 0;
2282 	}
2283 	kern_nexus_update_netagents();
2284 }
2285 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2286 
/*
 * One-time initialization of the DLIL (data link interface layer):
 * validates compile-time layout/constant invariants, parses boot-args,
 * resolves the Skywalk netagent policy (device tree then boot-arg
 * overrides), creates the zones used for dlil_ifnet and per-interface
 * TCP/UDP stats, initializes the subsystems layered on DLIL, and
 * finally starts the main input thread and the interface detacher
 * thread, waiting for both to have been scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_6LOWPAN == IFNET_FAMILY_6LOWPAN);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* tunable overrides from boot-args */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2600 
2601 static void
if_flt_monitor_busy(struct ifnet * ifp)2602 if_flt_monitor_busy(struct ifnet *ifp)
2603 {
2604 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2605 
2606 	++ifp->if_flt_busy;
2607 	VERIFY(ifp->if_flt_busy != 0);
2608 }
2609 
/*
 * Drop one busy hold on the interface filter list; alias for
 * if_flt_monitor_leave().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2615 
2616 static void
if_flt_monitor_enter(struct ifnet * ifp)2617 if_flt_monitor_enter(struct ifnet *ifp)
2618 {
2619 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2620 
2621 	while (ifp->if_flt_busy) {
2622 		++ifp->if_flt_waiters;
2623 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2624 		    (PZERO - 1), "if_flt_monitor", NULL);
2625 	}
2626 	if_flt_monitor_busy(ifp);
2627 }
2628 
2629 static void
if_flt_monitor_leave(struct ifnet * ifp)2630 if_flt_monitor_leave(struct ifnet *ifp)
2631 {
2632 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2633 
2634 	VERIFY(ifp->if_flt_busy != 0);
2635 	--ifp->if_flt_busy;
2636 
2637 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2638 		ifp->if_flt_waiters = 0;
2639 		wakeup(&ifp->if_flt_head);
2640 	}
2641 }
2642 
/*
 * Attach the interface filter described by 'if_filter' to 'ifp' and
 * return the newly created filter reference via 'filter_ref'.
 *
 * Returns 0 on success or ENXIO when the interface is not in the
 * global list or is no longer attached.  On success the extra I/O
 * reference taken by ifnet_is_attached(ifp, 1) is dropped before
 * returning; on failure any allocated filter is freed.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* on success this takes an extra I/O refcnt, released below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* serialize against other filter-list walkers via the monitor */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	/* global and per-interface attach accounting */
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* drop the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2732 
/*
 * Core of interface filter detach.
 *
 * When 'detached' is zero, the filter reference is looked up on every
 * attached interface's filter list; if found, it is first marked
 * filt_skip (so input/output paths stop calling it), then unlinked
 * under the filter monitor and destroyed.  EINVAL is returned when the
 * reference does not match any attached filter.
 *
 * When 'detached' is non-zero, the caller (ifnet_detach_final) has
 * already emptied if_flt_head; only the counters are adjusted before
 * the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and wait for exclusive access */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* global detach accounting; counts must stay non-negative */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* NOTE: filter is NULL on the destroy path, so this only fires
	 * for lookup failures where the ref was never freed here */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2853 
2854 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2855 dlil_detach_filter(interface_filter_t filter)
2856 {
2857 	if (filter == NULL) {
2858 		return;
2859 	}
2860 	dlil_detach_filter_internal(filter, 0);
2861 }
2862 
2863 __private_extern__ boolean_t
dlil_has_ip_filter(void)2864 dlil_has_ip_filter(void)
2865 {
2866 	boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2867 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2868 	return has_filter;
2869 }
2870 
2871 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2872 dlil_has_if_filter(struct ifnet *ifp)
2873 {
2874 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2875 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2876 	return has_filter;
2877 }
2878 
2879 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2880 dlil_input_wakeup(struct dlil_threading_info *inp)
2881 {
2882 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2883 
2884 	inp->dlth_flags |= DLIL_INPUT_WAITING;
2885 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
2886 		inp->dlth_wtot++;
2887 		wakeup_one((caddr_t)&inp->dlth_flags);
2888 	}
2889 }
2890 
/*
 * Entry point of the main DLIL input thread.  Performs one-time
 * sanity checks, places the thread in the "embryonic" state, and
 * hands control over to dlil_main_input_thread_cont() via a
 * thread-block continuation; the function itself never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	/* this entry point serves only the singleton main input thread */
	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2913 
2914 /*
2915  * Main input thread:
2916  *
2917  *   a) handles all inbound packets for lo0
2918  *   b) handles all inbound packets for interfaces with no dedicated
2919  *	input thread (e.g. anything but Ethernet/PDP or those that support
2920  *	opportunistic polling.)
2921  *   c) protocol registrations
2922  *   d) packet injections
2923  */
/*
 * Continuation routine of the main DLIL input thread (see block
 * comment above).  Loops while there is pending work, draining both
 * packet queues under dlth_lock and processing them unlocked, then
 * re-blocks on dlth_flags with itself as the continuation.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* 'v' is both the generic info and the main-thread variant */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* process the drained chains without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* no flags besides RUNNING set: nothing more to do */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3010 
3011 /*
3012  * Input thread for interfaces with legacy input model.
3013  */
/*
 * Entry point of a per-interface legacy-model input thread.  Names
 * the thread after the interface, enters the embryonic state, and
 * transfers control to dlil_input_thread_cont() via a thread-block
 * continuation; never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* rxpoll-capable legacy interfaces use the rxpoll thread instead */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3048 
/*
 * Continuation routine of a per-interface legacy-model input thread.
 * Drains the interface's packet queue under dlth_lock, processes the
 * chain unlocked, and re-blocks with itself as continuation.  When
 * DLIL_INPUT_TERMINATE is set (or the wait was interrupted), jumps
 * to the terminate path, which hands the lock-released thread to
 * dlil_terminate_input_thread() and never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* NOTE: 'terminate' label below is entered with dlth_lock held */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* process the drained chain without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* only RUNNING/TERMINATE left: no more pending work */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3152 
3153 /*
3154  * Input thread for interfaces with opportunistic polling input model.
3155  */
/*
 * Entry point of a per-interface opportunistic-polling input thread.
 * Names the thread after the interface, enters the embryonic state,
 * and transfers control to dlil_rxpoll_input_thread_cont() via a
 * thread-block continuation; never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* only legacy interfaces with RXPOLL capability use this thread */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3187 
/*
 * Continuation routine of a per-interface opportunistic-polling input
 * thread.  In addition to draining and processing the interface's
 * packet queue (like dlil_input_thread_cont), it samples inbound
 * packet/byte/wakeup rates into EWMAs and, when the averages cross
 * the configured low/high watermarks for long enough, switches the
 * interface between IFNET_MODEL_INPUT_POLL_ON and _POLL_OFF by
 * downcalling the driver via if_input_ctl.  On termination it jumps
 * to dlil_terminate_input_thread() and never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* NOTE: 'terminate' label below is entered with dlth_lock held */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass: leave embryonic state, skip rate sampling */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the polling interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep accumulating until the hold time elapses */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* honor the mode hold time to avoid flip-flopping */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		/* process drained chain / driver downcall without the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* only RUNNING/TERMINATE left: no more pending work */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3473 
3474 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3475 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3476 {
3477 	if (p != NULL) {
3478 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3479 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3480 			return EINVAL;
3481 		}
3482 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3483 		    p->packets_lowat >= p->packets_hiwat) {
3484 			return EINVAL;
3485 		}
3486 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3487 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3488 			return EINVAL;
3489 		}
3490 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3491 		    p->bytes_lowat >= p->bytes_hiwat) {
3492 			return EINVAL;
3493 		}
3494 		if (p->interval_time != 0 &&
3495 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3496 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3497 		}
3498 	}
3499 	return 0;
3500 }
3501 
/*
 * Recompute the interface's RX opportunistic-polling parameters.
 *
 * Caller may pass NULL for p to request pure auto-tuning.  If the link
 * rate is unknown (0) and no explicit parameters were given, polling is
 * effectively disabled by zeroing the low watermarks and maxing out the
 * high watermarks.  Otherwise each value is taken from p when non-zero,
 * else auto-tuned from the link-speed table (rxpoll_tbl).
 *
 * NOTE(review): callers such as dlil_rxpoll_set_params() hold the input
 * thread lock around this; not asserted here — confirm all callers do.
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry not exceeding the link rate */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* a non-zero global if_rxpoll_max overrides the caller's limit */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		/* a tuned global interval likewise overrides the caller's */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond holdtimes to timespec form for the pollers */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3571 
3572 /*
3573  * Must be called on an attached ifnet (caller is expected to check.)
3574  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3575  */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	/* only meaningful for RXPOLL-capable interfaces with an input thread */
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	/* 'locked' means the caller already holds the input thread lock */
	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3613 
3614 /*
3615  * Must be called on an attached ifnet (caller is expected to check.)
3616  */
3617 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3618 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3619 {
3620 	struct dlil_threading_info *inp;
3621 
3622 	VERIFY(ifp != NULL && p != NULL);
3623 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3624 		return ENXIO;
3625 	}
3626 
3627 	bzero(p, sizeof(*p));
3628 
3629 	lck_mtx_lock(&inp->dlth_lock);
3630 	p->packets_limit = ifp->if_rxpoll_plim;
3631 	p->packets_lowat = ifp->if_rxpoll_plowat;
3632 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3633 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3634 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3635 	p->interval_time = ifp->if_rxpoll_ival;
3636 	lck_mtx_unlock(&inp->dlth_lock);
3637 
3638 	return 0;
3639 }
3640 
3641 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3642 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3643     const struct ifnet_stat_increment_param *s)
3644 {
3645 	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3646 }
3647 
3648 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3649 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3650     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3651 {
3652 	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3653 }
3654 
3655 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3656 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3657     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3658 {
3659 	return ifnet_input_common(ifp, m_head, m_tail, s,
3660 	           (m_head != NULL), TRUE);
3661 }
3662 
/*
 * Common ingress path shared by ifnet_input{,_extended,_poll}().
 *
 * Validates the packet chain and stat parameters, takes an I/O refcnt
 * on the interface (except lo0) so it cannot be detached mid-input,
 * recomputes the packet/byte totals as needed, then hands the chain to
 * the interface's input function (if_input_dlil).
 *
 * 'ext' means the caller supplied driver-provided stats (s) plus a tail
 * pointer; 'poll' means the call originates from the RX poller, in
 * which case an empty chain is allowed and acts as a notification.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* a NULL chain is only legal in poll mode; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count as we go */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		/* tail given: trust driver counts unless sanity checks are on */
		if (__improbable(dlil_input_sanity_check != 0)) {
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;	/* s now aliases the local copy */
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): input_func receives 's', which aliases &_s only
	 * when the caller passed NULL stats; when the caller supplied s,
	 * the recomputed _s totals above are not what's passed down —
	 * confirm this is intentional.
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3777 
3778 #if SKYWALK
/*
 * Atomically install fn as the interface's DLIL input handler, but only
 * if the current handler is still the default (dlil_input_handler).
 * Returns EBUSY if another handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3786 
/*
 * Restore the default DLIL input handler, retrying the swap (with the
 * current value re-read on each iteration) until it succeeds.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
/*
 * Atomically install fn as the interface's DLIL output handler, but only
 * if the current handler is still the default (dlil_output_handler).
 * Returns EBUSY if another handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3803 
/*
 * Restore the default DLIL output handler, retrying the swap (with the
 * current value re-read on each iteration) until it succeeds.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3813 #endif /* SKYWALK */
3814 
3815 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3816 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3817 {
3818 	return ifp->if_output(ifp, m);
3819 }
3820 
3821 errno_t
dlil_input_handler(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)3822 dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
3823     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
3824     boolean_t poll, struct thread *tp)
3825 {
3826 	struct dlil_threading_info *inp = ifp->if_inp;
3827 
3828 	if (__improbable(inp == NULL)) {
3829 		inp = dlil_main_input_thread;
3830 	}
3831 
3832 	return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
3833 }
3834 
/*
 * Asynchronous input strategy (dlth_strategy): enqueue the chain on the
 * input thread's receive queue, fold in the stats, and wake the input
 * thread; protocol processing happens later in that thread's context.
 * Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		/* record the calling thread as driver or poller thread */
		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: re-walk the chain and verify the caller's counts */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* notify outside the lock to avoid holding it during the upcall */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
3947 
/*
 * Synchronous input strategy (dlth_strategy): enqueue the chain, then
 * immediately drain the queue and run packet processing inline in the
 * caller's context instead of waking a separate input thread.  Only
 * used for dedicated (non-main) input threads.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: re-walk the chain and verify the caller's counts */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued so far (may include older packets) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4032 
4033 #if SKYWALK
/*
 * Atomically replace the driver's if_output with fn, but only if the
 * current value still matches the saved original (if_save_output).
 * Returns EBUSY if someone else already swapped it.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4041 
/*
 * Restore the driver's saved if_output routine, retrying the swap (with
 * the current value re-read on each iteration) until it succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4051 
/*
 * Atomically replace the driver's if_start with fn, but only if the
 * current value still matches the saved original (if_save_start).
 * Returns EBUSY if someone else already swapped it.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return atomic_test_set_ptr(&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4059 
/*
 * Restore the driver's saved if_start routine, retrying the swap (with
 * the current value re-read on each iteration) until it succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4069 #endif /* SKYWALK */
4070 
/*
 * Common helper for ifnet_start() and the flow-control resume path.
 * Records a start request and wakes the dedicated starter thread when
 * it is idle.  'resetfc' clears IFSF_FLOW_CONTROLLED first (resume
 * path); otherwise a flow-controlled interface is left alone.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	/* only interfaces with a dedicated starter thread are eligible */
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/* wake the starter unless delayed-start batching applies */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4100 
4101 void
ifnet_start(struct ifnet * ifp)4102 ifnet_start(struct ifnet *ifp)
4103 {
4104 	ifnet_start_common(ifp, FALSE);
4105 }
4106 
/*
 * Entry point of the per-interface starter thread.  Names the thread,
 * optionally binds it to the main input thread's affinity set (lo0
 * only), then parks in the embryonic state until the continuation
 * (ifnet_start_thread_cont) is kicked.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	/* park in embryonic state; the continuation does the real work */
	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4172 
/*
 * Continuation body of the starter thread.  Each wakeup services start
 * requests by invoking the driver's if_start until no new requests
 * arrive or the interface is disabled, then blocks again on itself as
 * the continuation.  Terminates (and self-destructs) when the thread
 * is interrupted or IFSF_TERMINATING is set.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: just leave the embryonic state */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/* batch small queues by deferring the start (delay-start) */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* delayed start pending: wake after the delay-start timeout */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4320 
4321 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4322 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4323 {
4324 	if (ts == NULL) {
4325 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4326 	} else {
4327 		*(&ifp->if_start_cycle) = *ts;
4328 	}
4329 
4330 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4331 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4332 		    if_name(ifp), ts->tv_nsec);
4333 	}
4334 }
4335 
4336 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4337 ifnet_poll_wakeup(struct ifnet *ifp)
4338 {
4339 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4340 
4341 	ifp->if_poll_req++;
4342 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4343 	    ifp->if_poll_thread != THREAD_NULL) {
4344 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4345 	}
4346 }
4347 
4348 void
ifnet_poll(struct ifnet * ifp)4349 ifnet_poll(struct ifnet *ifp)
4350 {
4351 	/*
4352 	 * If the poller thread is inactive, signal it to do work.
4353 	 */
4354 	lck_mtx_lock_spin(&ifp->if_poll_lock);
4355 	ifnet_poll_wakeup(ifp);
4356 	lck_mtx_unlock(&ifp->if_poll_lock);
4357 }
4358 
/*
 * Entry point of the per-interface RX poller thread.  Names the
 * thread, then parks in the embryonic state until the continuation
 * (ifnet_poll_thread_cont) is kicked.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	/* park in embryonic state; the continuation does the real work */
	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4387 
/*
 * Continuation routine for the per-interface input poller thread.
 * Invoked every time the thread is woken (by ifnet_poll_wakeup() or by
 * the poll-cycle deadline); it repeatedly invokes the driver's input
 * poll callback until no new poll request has arrived, then blocks
 * again with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* bail out if the wait was interrupted or a detach is in progress */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after thread creation: leave the embryonic state
	 * and account for this thread being ready; the pending-thread
	 * count must be decremented with the poll lock dropped.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot the request counter; a change signals more work */
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet limit: explicit limit, else derived */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			/* hand the polled chain to the input path */
			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* no packets polled; still notify the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		/* block again, with this function as the continuation */
		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4554 
4555 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4556 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4557 {
4558 	if (ts == NULL) {
4559 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4560 	} else {
4561 		*(&ifp->if_poll_cycle) = *ts;
4562 	}
4563 
4564 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4565 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4566 		    if_name(ifp), ts->tv_nsec);
4567 	}
4568 }
4569 
4570 void
ifnet_purge(struct ifnet * ifp)4571 ifnet_purge(struct ifnet *ifp)
4572 {
4573 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4574 		if_qflush_snd(ifp, false);
4575 	}
4576 }
4577 
4578 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4579 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4580 {
4581 	IFCQ_LOCK_ASSERT_HELD(ifq);
4582 
4583 	if (!(IFCQ_IS_READY(ifq))) {
4584 		return;
4585 	}
4586 
4587 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4588 		struct tb_profile tb = {
4589 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4590 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4591 		};
4592 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4593 	}
4594 
4595 	ifclassq_update(ifq, ev);
4596 }
4597 
4598 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4599 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4600 {
4601 	switch (ev) {
4602 	case CLASSQ_EV_LINK_BANDWIDTH:
4603 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4604 			ifp->if_poll_update++;
4605 		}
4606 		break;
4607 
4608 	default:
4609 		break;
4610 	}
4611 }
4612 
4613 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4614 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4615 {
4616 	struct ifclassq *ifq;
4617 	u_int32_t omodel;
4618 	errno_t err;
4619 
4620 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4621 		return EINVAL;
4622 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4623 		return ENXIO;
4624 	}
4625 
4626 	ifq = ifp->if_snd;
4627 	IFCQ_LOCK(ifq);
4628 	omodel = ifp->if_output_sched_model;
4629 	ifp->if_output_sched_model = model;
4630 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4631 		ifp->if_output_sched_model = omodel;
4632 	}
4633 	IFCQ_UNLOCK(ifq);
4634 
4635 	return err;
4636 }
4637 
4638 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4639 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4640 {
4641 	if (ifp == NULL) {
4642 		return EINVAL;
4643 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4644 		return ENXIO;
4645 	}
4646 
4647 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4648 
4649 	return 0;
4650 }
4651 
4652 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4653 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4654 {
4655 	if (ifp == NULL || maxqlen == NULL) {
4656 		return EINVAL;
4657 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4658 		return ENXIO;
4659 	}
4660 
4661 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4662 
4663 	return 0;
4664 }
4665 
4666 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4667 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4668 {
4669 	errno_t err;
4670 
4671 	if (ifp == NULL || pkts == NULL) {
4672 		err = EINVAL;
4673 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4674 		err = ENXIO;
4675 	} else {
4676 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4677 		    pkts, NULL);
4678 	}
4679 
4680 	return err;
4681 }
4682 
4683 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4684 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4685     u_int32_t *pkts, u_int32_t *bytes)
4686 {
4687 	errno_t err;
4688 
4689 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4690 	    (pkts == NULL && bytes == NULL)) {
4691 		err = EINVAL;
4692 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4693 		err = ENXIO;
4694 	} else {
4695 		err = ifclassq_get_len(ifp->if_snd, sc, pkts, bytes);
4696 	}
4697 
4698 	return err;
4699 }
4700 
4701 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4702 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4703 {
4704 	struct dlil_threading_info *inp;
4705 
4706 	if (ifp == NULL) {
4707 		return EINVAL;
4708 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4709 		return ENXIO;
4710 	}
4711 
4712 	if (maxqlen == 0) {
4713 		maxqlen = if_rcvq_maxlen;
4714 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4715 		maxqlen = IF_RCVQ_MINLEN;
4716 	}
4717 
4718 	inp = ifp->if_inp;
4719 	lck_mtx_lock(&inp->dlth_lock);
4720 	qlimit(&inp->dlth_pkts) = maxqlen;
4721 	lck_mtx_unlock(&inp->dlth_lock);
4722 
4723 	return 0;
4724 }
4725 
4726 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4727 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4728 {
4729 	struct dlil_threading_info *inp;
4730 
4731 	if (ifp == NULL || maxqlen == NULL) {
4732 		return EINVAL;
4733 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4734 		return ENXIO;
4735 	}
4736 
4737 	inp = ifp->if_inp;
4738 	lck_mtx_lock(&inp->dlth_lock);
4739 	*maxqlen = qlimit(&inp->dlth_pkts);
4740 	lck_mtx_unlock(&inp->dlth_lock);
4741 	return 0;
4742 }
4743 
4744 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4745 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4746     uint16_t delay_timeout)
4747 {
4748 	if (delay_qlen > 0 && delay_timeout > 0) {
4749 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4750 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4751 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4752 		/* convert timeout to nanoseconds */
4753 		ifp->if_start_delay_timeout *= 1000;
4754 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4755 		    ifp->if_xname, (uint32_t)delay_qlen,
4756 		    (uint32_t)delay_timeout);
4757 	} else {
4758 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4759 	}
4760 }
4761 
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 *
 * For IPv4 the header checksum is patched incrementally rather than
 * recomputed; unaligned headers are bounced through a local aligned
 * buffer and copied back only if a modification was made.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* bounce buffer, large enough for either IP header */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set (ECN bits are preserved) */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/* incremental checksum fix-up for the changed tos byte */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write back only when we worked in the bounce buffer */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits in the traffic-class field */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* IPv6 has no header checksum to adjust */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write back only when we worked in the bounce buffer */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4822 
/*
 * Core transmit enqueue path: timestamps the packet (mbuf or Skywalk
 * packet), applies the Wi-Fi multicast DSCP-clearing workaround, runs
 * the delayed-start coalescing heuristics, enqueues onto the given
 * classq (or the interface default), and kicks the driver's start
 * routine.  Caller consumes the packet object on return.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	/* non-NULL iff the DSCP workaround must run; points at IP header */
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; skip the workaround entirely */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may have moved the data */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* undersized frames skip the workaround */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; skip the workaround entirely */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* ip_ver was set on every path that also set mcast_buf */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the window: count the packet */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle >= 200ms: reset and disable delaying */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on the count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new measurement window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5133 
5134 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5135 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, classq_pkt_t *head,
5136     classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5137     boolean_t *pdrop)
5138 {
5139 	int error;
5140 
5141 	/* enqueue the packet (caller consumes object) */
5142 	error = ifclassq_enqueue(ifp->if_snd, head, tail, cnt, bytes, pdrop);
5143 
5144 	/*
5145 	 * Tell the driver to start dequeueing; do this even when the queue
5146 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5147 	 * be dequeueing from other unsuspended queues.
5148 	 */
5149 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5150 		ifnet_start(ifp);
5151 	}
5152 	return error;
5153 }
5154 
5155 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5156 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5157 {
5158 	struct ifnet *ifp = handle;
5159 	boolean_t pdrop;        /* dummy */
5160 	uint32_t i;
5161 
5162 	ASSERT(n_pkts >= 1);
5163 	for (i = 0; i < n_pkts - 1; i++) {
5164 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5165 		    FALSE, &pdrop);
5166 	}
5167 	/* flush with the last packet */
5168 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5169 	    TRUE, &pdrop);
5170 
5171 	return 0;
5172 }
5173 
5174 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5175 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5176     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5177 {
5178 	if (ifp->if_output_netem != NULL) {
5179 		return netem_enqueue(ifp->if_output_netem, pkt, pdrop);
5180 	} else {
5181 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5182 	}
5183 }
5184 
5185 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5186 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5187 {
5188 	boolean_t pdrop;
5189 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5190 }
5191 
5192 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5193 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5194     boolean_t *pdrop)
5195 {
5196 	classq_pkt_t pkt;
5197 
5198 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5199 	    m->m_nextpkt != NULL) {
5200 		if (m != NULL) {
5201 			m_freem_list(m);
5202 			*pdrop = TRUE;
5203 		}
5204 		return EINVAL;
5205 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5206 	    !IF_FULLY_ATTACHED(ifp)) {
5207 		/* flag tested without lock for performance */
5208 		m_freem(m);
5209 		*pdrop = TRUE;
5210 		return ENXIO;
5211 	} else if (!(ifp->if_flags & IFF_UP)) {
5212 		m_freem(m);
5213 		*pdrop = TRUE;
5214 		return ENETDOWN;
5215 	}
5216 
5217 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5218 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5219 }
5220 
5221 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5222 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5223     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5224     boolean_t *pdrop)
5225 {
5226 	classq_pkt_t head, tail;
5227 
5228 	ASSERT(m_head != NULL);
5229 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5230 	ASSERT(m_tail != NULL);
5231 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5232 	ASSERT(ifp != NULL);
5233 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5234 
5235 	if (!IF_FULLY_ATTACHED(ifp)) {
5236 		/* flag tested without lock for performance */
5237 		m_freem_list(m_head);
5238 		*pdrop = TRUE;
5239 		return ENXIO;
5240 	} else if (!(ifp->if_flags & IFF_UP)) {
5241 		m_freem_list(m_head);
5242 		*pdrop = TRUE;
5243 		return ENETDOWN;
5244 	}
5245 
5246 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5247 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5248 	return ifnet_enqueue_ifclassq_chain(ifp, &head, &tail, cnt, bytes,
5249 	           flush, pdrop);
5250 }
5251 
5252 #if SKYWALK
5253 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5254 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5255     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5256 {
5257 	classq_pkt_t pkt;
5258 
5259 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5260 
5261 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5262 		if (kpkt != NULL) {
5263 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5264 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5265 			*pdrop = TRUE;
5266 		}
5267 		return EINVAL;
5268 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5269 	    !IF_FULLY_ATTACHED(ifp))) {
5270 		/* flag tested without lock for performance */
5271 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5272 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5273 		*pdrop = TRUE;
5274 		return ENXIO;
5275 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5276 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5277 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5278 		*pdrop = TRUE;
5279 		return ENETDOWN;
5280 	}
5281 
5282 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5283 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5284 }
5285 
5286 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5287 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5288     boolean_t flush, boolean_t *pdrop)
5289 {
5290 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5291 }
5292 
5293 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5294 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5295     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5296 {
5297 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5298 }
5299 
5300 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5301 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5302     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5303     boolean_t *pdrop)
5304 {
5305 	classq_pkt_t head, tail;
5306 
5307 	ASSERT(k_head != NULL);
5308 	ASSERT(k_tail != NULL);
5309 	ASSERT(ifp != NULL);
5310 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5311 
5312 	if (!IF_FULLY_ATTACHED(ifp)) {
5313 		/* flag tested without lock for performance */
5314 		pp_free_packet_chain(k_head, NULL);
5315 		*pdrop = TRUE;
5316 		return ENXIO;
5317 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5318 		pp_free_packet_chain(k_head, NULL);
5319 		*pdrop = TRUE;
5320 		return ENETDOWN;
5321 	}
5322 
5323 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5324 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5325 	return ifnet_enqueue_ifclassq_chain(ifp, &head, &tail, cnt, bytes,
5326 	           flush, pdrop);
5327 }
5328 #endif /* SKYWALK */
5329 
5330 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5331 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5332 {
5333 	errno_t rc;
5334 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5335 
5336 	if (ifp == NULL || mp == NULL) {
5337 		return EINVAL;
5338 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5339 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5340 		return ENXIO;
5341 	}
5342 	if (!ifnet_is_attached(ifp, 1)) {
5343 		return ENXIO;
5344 	}
5345 
5346 #if SKYWALK
5347 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5348 #endif /* SKYWALK */
5349 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5350 	    &pkt, NULL, NULL, NULL);
5351 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5352 	ifnet_decr_iorefcnt(ifp);
5353 	*mp = pkt.cp_mbuf;
5354 	return rc;
5355 }
5356 
5357 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5358 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5359     struct mbuf **mp)
5360 {
5361 	errno_t rc;
5362 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5363 
5364 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5365 		return EINVAL;
5366 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5367 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5368 		return ENXIO;
5369 	}
5370 	if (!ifnet_is_attached(ifp, 1)) {
5371 		return ENXIO;
5372 	}
5373 
5374 #if SKYWALK
5375 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5376 #endif /* SKYWALK */
5377 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5378 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL);
5379 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5380 	ifnet_decr_iorefcnt(ifp);
5381 	*mp = pkt.cp_mbuf;
5382 	return rc;
5383 }
5384 
5385 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5386 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5387     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5388 {
5389 	errno_t rc;
5390 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5391 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5392 
5393 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5394 		return EINVAL;
5395 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5396 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5397 		return ENXIO;
5398 	}
5399 	if (!ifnet_is_attached(ifp, 1)) {
5400 		return ENXIO;
5401 	}
5402 
5403 #if SKYWALK
5404 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5405 #endif /* SKYWALK */
5406 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5407 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len);
5408 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5409 	ifnet_decr_iorefcnt(ifp);
5410 	*head = pkt_head.cp_mbuf;
5411 	if (tail != NULL) {
5412 		*tail = pkt_tail.cp_mbuf;
5413 	}
5414 	return rc;
5415 }
5416 
5417 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5418 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5419     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5420 {
5421 	errno_t rc;
5422 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5423 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5424 
5425 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5426 		return EINVAL;
5427 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5428 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5429 		return ENXIO;
5430 	}
5431 	if (!ifnet_is_attached(ifp, 1)) {
5432 		return ENXIO;
5433 	}
5434 
5435 #if SKYWALK
5436 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5437 #endif /* SKYWALK */
5438 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5439 	    byte_limit, &pkt_head, &pkt_tail, cnt, len);
5440 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5441 	ifnet_decr_iorefcnt(ifp);
5442 	*head = pkt_head.cp_mbuf;
5443 	if (tail != NULL) {
5444 		*tail = pkt_tail.cp_mbuf;
5445 	}
5446 	return rc;
5447 }
5448 
5449 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5450 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5451     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5452     u_int32_t *len)
5453 {
5454 	errno_t rc;
5455 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5456 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5457 
5458 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5459 	    !MBUF_VALID_SC(sc)) {
5460 		return EINVAL;
5461 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5462 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5463 		return ENXIO;
5464 	}
5465 	if (!ifnet_is_attached(ifp, 1)) {
5466 		return ENXIO;
5467 	}
5468 
5469 #if SKYWALK
5470 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5471 #endif /* SKYWALK */
5472 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5473 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5474 	    cnt, len);
5475 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5476 	ifnet_decr_iorefcnt(ifp);
5477 	*head = pkt_head.cp_mbuf;
5478 	if (tail != NULL) {
5479 		*tail = pkt_tail.cp_mbuf;
5480 	}
5481 	return rc;
5482 }
5483 
#if XNU_TARGET_OS_OSX
/*
 * Adapter that lets a legacy framer be called through the newer framer
 * interface: the legacy callback cannot report prepended/appended byte
 * counts, so both are reported as zero when requested.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (post != NULL) {
		*post = 0;
	}
	if (pre != NULL) {
		*pre = 0;
	}

	/* Delegate the actual framing to the legacy callback. */
	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
#endif /* XNU_TARGET_OS_OSX */
5500 
5501 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5502 packet_has_vlan_tag(struct mbuf * m)
5503 {
5504 	u_int   tag = 0;
5505 
5506 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5507 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5508 		if (tag == 0) {
5509 			/* the packet is just priority-tagged, clear the bit */
5510 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5511 		}
5512 	}
5513 	return tag != 0;
5514 }
5515 
/*
 * Run an inbound packet through the interface filter chain attached to
 * ifp.  A filter may modify or replace *m_p and *frame_header_p.
 * Returns 0 when the packet should continue up the stack; otherwise
 * the filter's non-zero result is returned and the caller stops
 * processing (callers free the packet unless the result is
 * EJUSTRETURN, in which case the filter has taken ownership).
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/* Fast path: no filters attached, nothing to do. */
	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the mutex across the callback; the busy
			 * marker taken above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5576 
/*
 * Run an outbound packet through the interface filter chain attached
 * to ifp.  A filter may modify or replace *m_p.  Returns 0 when the
 * packet should proceed to the driver; otherwise the filter's non-zero
 * result is returned and the caller stops processing.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the mutex across the callback; the busy
			 * marker taken above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5626 
5627 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5628 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5629 {
5630 	int error;
5631 
5632 	if (ifproto->proto_kpi == kProtoKPI_v1) {
5633 		/* Version 1 protocols get one packet at a time */
5634 		while (m != NULL) {
5635 			char *  frame_header;
5636 			mbuf_t  next_packet;
5637 
5638 			next_packet = m->m_nextpkt;
5639 			m->m_nextpkt = NULL;
5640 			frame_header = m->m_pkthdr.pkt_hdr;
5641 			m->m_pkthdr.pkt_hdr = NULL;
5642 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5643 			    ifproto->protocol_family, m, frame_header);
5644 			if (error != 0 && error != EJUSTRETURN) {
5645 				m_freem(m);
5646 			}
5647 			m = next_packet;
5648 		}
5649 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
5650 		/* Version 2 protocols support packet lists */
5651 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5652 		    ifproto->protocol_family, m);
5653 		if (error != 0 && error != EJUSTRETURN) {
5654 			m_freem_list(m);
5655 		}
5656 	}
5657 }
5658 
5659 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5660 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5661     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5662 {
5663 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5664 
5665 	if (s->packets_in != 0) {
5666 		d->packets_in += s->packets_in;
5667 	}
5668 	if (s->bytes_in != 0) {
5669 		d->bytes_in += s->bytes_in;
5670 	}
5671 	if (s->errors_in != 0) {
5672 		d->errors_in += s->errors_in;
5673 	}
5674 
5675 	if (s->packets_out != 0) {
5676 		d->packets_out += s->packets_out;
5677 	}
5678 	if (s->bytes_out != 0) {
5679 		d->bytes_out += s->bytes_out;
5680 	}
5681 	if (s->errors_out != 0) {
5682 		d->errors_out += s->errors_out;
5683 	}
5684 
5685 	if (s->collisions != 0) {
5686 		d->collisions += s->collisions;
5687 	}
5688 	if (s->dropped != 0) {
5689 		d->dropped += s->dropped;
5690 	}
5691 
5692 	if (poll) {
5693 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5694 	}
5695 }
5696 
5697 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)5698 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
5699 {
5700 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
5701 
5702 	/*
5703 	 * Use of atomic operations is unavoidable here because
5704 	 * these stats may also be incremented elsewhere via KPIs.
5705 	 */
5706 	if (s->packets_in != 0) {
5707 		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
5708 		s->packets_in = 0;
5709 	}
5710 	if (s->bytes_in != 0) {
5711 		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
5712 		s->bytes_in = 0;
5713 	}
5714 	if (s->errors_in != 0) {
5715 		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
5716 		s->errors_in = 0;
5717 	}
5718 
5719 	if (s->packets_out != 0) {
5720 		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
5721 		s->packets_out = 0;
5722 	}
5723 	if (s->bytes_out != 0) {
5724 		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
5725 		s->bytes_out = 0;
5726 	}
5727 	if (s->errors_out != 0) {
5728 		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
5729 		s->errors_out = 0;
5730 	}
5731 
5732 	if (s->collisions != 0) {
5733 		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
5734 		s->collisions = 0;
5735 	}
5736 	if (s->dropped != 0) {
5737 		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
5738 		s->dropped = 0;
5739 	}
5740 
5741 	/*
5742 	 * No need for atomic operations as they are modified here
5743 	 * only from within the DLIL input thread context.
5744 	 */
5745 	if (ifp->if_poll_tstats.packets != 0) {
5746 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
5747 		ifp->if_poll_tstats.packets = 0;
5748 	}
5749 	if (ifp->if_poll_tstats.bytes != 0) {
5750 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
5751 		ifp->if_poll_tstats.bytes = 0;
5752 	}
5753 
5754 	return ifp->if_data_threshold != 0;
5755 }
5756 
5757 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5758 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5759 {
5760 	return dlil_input_packet_list_common(ifp, m, 0,
5761 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5762 }
5763 
5764 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5765 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5766     u_int32_t cnt, ifnet_model_t mode)
5767 {
5768 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5769 }
5770 
/*
 * Core DLIL input routine.  Walks a chain of received packets and, for
 * each one: optionally kicks legacy RX polling, takes an IO (data-mov)
 * reference on the receiving interface, classifies the packet, demuxes
 * it to a protocol family, applies CLAT46/NAT64 translation where
 * needed, runs the interface filter chain, and finally batches
 * consecutive packets destined to the same protocol attachment before
 * delivering them via dlil_ifproto_input().
 *
 * ifp_param is the interface all packets arrived on, or NULL when each
 * packet's rcvif should be used instead.  cnt/mode/ext carry the
 * extended-call information from dlil_input_packet_list_extended().
 * Packets that cannot be delivered are freed here.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;        /* head of the batch for last_ifproto */
	mbuf_t *pkt_next = NULL;        /* tail pointer of that batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;               /* 1 while we hold a data-mov ref on ifp */

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* Arm periodic poll kicks only for extended poll-on batches. */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* Legacy RX poll: nudge the poller every poll_ival packets. */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* Optionally log wake packets (first 64 bytes hexdumped). */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* dlil_clat64() may replace m and protocol_family */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* header pointer inconsistent: drop the csum */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			atomic_add_64(&ifp->if_imcasts, 1);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* same protocol as the previous packet: reuse it */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6103 
6104 errno_t
if_mcasts_update(struct ifnet * ifp)6105 if_mcasts_update(struct ifnet *ifp)
6106 {
6107 	errno_t err;
6108 
6109 	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6110 	if (err == EAFNOSUPPORT) {
6111 		err = 0;
6112 	}
6113 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6114 	    "(err=%d)\n", if_name(ifp),
6115 	    (err == 0 ? "successfully restored" : "failed to restore"),
6116 	    ifp->if_updatemcasts, err);
6117 
6118 	/* just return success */
6119 	return 0;
6120 }
6121 
/* If ifp is set, we will increment the generation for the interface */
/*
 * Post a kernel event message, first bumping the interface generation
 * (when ifp is non-NULL) and notifying NECP clients so they observe
 * the state change.  Returns the result of kev_post_msg().
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp != NULL) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6136 
6137 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6138 dlil_post_sifflags_msg(struct ifnet * ifp)
6139 {
6140 	struct kev_msg ev_msg;
6141 	struct net_event_data ev_data;
6142 
6143 	bzero(&ev_data, sizeof(ev_data));
6144 	bzero(&ev_msg, sizeof(ev_msg));
6145 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6146 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6147 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6148 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6149 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6150 	ev_data.if_family = ifp->if_family;
6151 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6152 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6153 	ev_msg.dv[0].data_ptr = &ev_data;
6154 	ev_msg.dv[1].data_length = 0;
6155 	dlil_post_complete_msg(ifp, &ev_msg);
6156 }
6157 
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver a kernel event to every interface filter, every attached
 * protocol, and the interface's own event callback, then post it via
 * dlil_post_complete_msg() (which bumps the interface generation when
 * update_generation is true).  Protocol pointers are snapshotted into
 * a temporary array (stack-allocated for up to TMP_IF_PROTO_ARR_SIZE
 * protocols, otherwise heap-allocated) so the ifnet lock need not be
 * held across the callbacks.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto **tmp_ifproto_arr = NULL;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	int tmp_ifproto_arr_idx = 0;
	bool tmp_malloc = false;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the mutex across the callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			/* small attachment count: use the stack array */
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			MALLOC(tmp_ifproto_arr, struct if_proto **,
			    sizeof(*tmp_ifproto_arr) * if_proto_count,
			    M_TEMP, M_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
			tmp_malloc = true;
		}

		/* snapshot every attached protocol, taking a ref on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted protocol, dropping refs */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_malloc) {
		FREE(tmp_ifproto_arr, M_TEMP);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6261 
6262 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6263 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6264 {
6265 	struct kev_msg kev_msg;
6266 	int result = 0;
6267 
6268 	if (ifp == NULL || event == NULL) {
6269 		return EINVAL;
6270 	}
6271 
6272 	bzero(&kev_msg, sizeof(kev_msg));
6273 	kev_msg.vendor_code = event->vendor_code;
6274 	kev_msg.kev_class = event->kev_class;
6275 	kev_msg.kev_subclass = event->kev_subclass;
6276 	kev_msg.event_code = event->event_code;
6277 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6278 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6279 	kev_msg.dv[1].data_length = 0;
6280 
6281 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6282 
6283 	return result;
6284 }
6285 
6286 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6287 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6288 {
6289 	mbuf_t  n = m;
6290 	int chainlen = 0;
6291 
6292 	while (n != NULL) {
6293 		chainlen++;
6294 		n = n->m_next;
6295 	}
6296 	switch (chainlen) {
6297 	case 0:
6298 		break;
6299 	case 1:
6300 		atomic_add_64(&cls->cls_one, 1);
6301 		break;
6302 	case 2:
6303 		atomic_add_64(&cls->cls_two, 1);
6304 		break;
6305 	case 3:
6306 		atomic_add_64(&cls->cls_three, 1);
6307 		break;
6308 	case 4:
6309 		atomic_add_64(&cls->cls_four, 1);
6310 		break;
6311 	case 5:
6312 	default:
6313 		atomic_add_64(&cls->cls_five_or_more, 1);
6314 		break;
6315 	}
6316 }
6317 
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * The probe takes both a v4 and a v6 header slot; the one that does
 * not apply is passed as NULL (for PF_INET the v4 header also fills
 * the generic header slot — presumably matching the probe's argument
 * layout; confirm against the DTRACE_IP6 definition).
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6336 
6337 /*
6338  * dlil_output
6339  *
6340  * Caller should have a lock on the protocol domain if the protocol
6341  * doesn't support finer grained locking. In most cases, the lock
6342  * will be held from the socket layer and won't be released until
6343  * we return back to the socket layer.
6344  *
6345  * This does mean that we must take a protocol lock before we take
6346  * an interface lock if we're going to take both. This makes sense
6347  * because a protocol is likely to interact with an ifp while it
6348  * is under the protocol lock.
6349  *
6350  * An advisory code will be returned if adv is not null. This
6351  * can be used to provide feedback about interface queues to the
6352  * application.
6353  */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
	struct if_proto *proto = NULL;
	mbuf_t  m = NULL;
	mbuf_t  send_head = NULL;
	mbuf_t  *send_tail = &send_head;
	int iorefcnt = 0;               /* 1 once ifnet_datamov_begin() succeeds */
	u_int32_t pre = 0, post = 0;    /* bytes the framer prepended/appended */
	u_int32_t fpkts = 0, fbytes = 0; /* forwarded packet/byte counters */
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;   /* TRUE after first successful 4->6 translation */
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;      /* route ref taken for CLAT46; freed in cleanup */
	u_int32_t m_loop_set = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	if (raw == 0) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	if (packetlist == NULL) {
		goto cleanup;
	}

	/* Detach the head packet from the chain for pre-output processing */
	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = (const struct sockaddr *)&dest6;

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retrieve the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed by packetlist is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (raw == 0) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	/*
	 * Per-packet loop: frame, filter, and hand each packet of the
	 * chain to the driver (or batch onto send_head for send-list
	 * capable interfaces).
	 */
	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (!raw) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (raw == 0 && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp.  This will allow
			 * us to determine that it is a legitimate packet
			 * for the system.  Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context.  If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		retval = dlil_interface_filters_output(ifp, &m, proto_family);
		if (retval != 0) {
			if (retval != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary.  If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Record timestamp; ifnet_enqueue() will use this info
		 * rather than redoing the work.  An optimization could
		 * involve doing this just once at the top, if there are
		 * no interface filters attached, but that's probably
		 * not a big deal.
		 */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		(void) mbuf_set_timestamp(m, now_nsec, TRUE);

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/* Batch onto send_head; driver is called after the loop */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* Queue pushback: report via flow advisory, not error */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		/* Advance to the next packet in the caller's chain */
		m = packetlist;
		if (m != NULL) {
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	/* Flush any packets batched for send-list / multi-enqueue interfaces */
	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/* Fold forwarded-traffic counters into the interface stats */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		/* Release the CLAT46 route ref taken above */
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
6858 
6859 /*
6860  * This routine checks if the destination address is not a loopback, link-local,
6861  * multicast or broadcast address.
6862  */
6863 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)6864 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
6865 {
6866 	int ret = 0;
6867 	switch (proto_family) {
6868 	case PF_INET: {
6869 		struct ip *iph = mtod(m, struct ip *);
6870 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
6871 			ret = 1;
6872 		}
6873 		break;
6874 	}
6875 	case PF_INET6: {
6876 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
6877 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
6878 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
6879 			ret = 1;
6880 		}
6881 		break;
6882 	}
6883 	}
6884 
6885 	return ret;
6886 }
6887 /*
6888  * @brief This routine translates IPv4 packet to IPv6 packet,
6889  *     updates protocol checksum and also translates ICMP for code
6890  *     along with inner header translation.
6891  *
6892  * @param ifp Pointer to the interface
6893  * @param proto_family pointer to protocol family. It is updated if function
6894  *     performs the translation successfully.
6895  * @param m Pointer to the pointer pointing to the packet. Needed because this
6896  *     routine can end up changing the mbuf to a different one.
6897  *
6898  * @return 0 on success or else a negative value.
6899  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;      /* original v4 src/dst, kept for proto translation */
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;
	struct in6_addr dst;
	int error = 0;
	uint16_t off = 0;               /* IPv4 header length in bytes */
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 routines can operate on it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Capture original IPv4 fields before the header is rewritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/* Hand the (possibly replaced) mbuf back to the caller */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	/* Only flip the caller's proto family on full success */
	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7032 
7033 /*
7034  * @brief This routine translates incoming IPv6 to IPv4 packet,
7035  *     updates protocol checksum and also translates ICMPv6 outer
7036  *     and inner headers
7037  *
7038  * @return 0 on success or else a negative value.
7039  */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;     /* original v6 src/dst, kept for proto translation */
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf so the nat464 routines can operate on it */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class bits of the v6 flow label become the v4 TOS */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly replaced) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		/* Only flip the caller's proto family on full success */
		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7174 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(void *);

/* Argument carried by a deferred ifnet ioctl work item */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface; an io ref is held for the work item */
	u_long ioctl_code;      /* ioctl to issue from the work-queue thread */
};

struct ifnet_ioctl_event_nwk_wq_entry {
	/* queue linkage; kept first so the entry can be cast to nwk_wq_entry */
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7187 
/*
 * Queue an ioctl to be issued on the interface from the network
 * work-queue thread instead of the caller's context.  Takes an io
 * refcount on the interface here; it is released by
 * ifnet_ioctl_event_callback() after the ioctl completes.  If the
 * interface is not attached, the request is logged and dropped.
 */
void
ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;

	/*
	 * Get an io ref count if the interface is attached.
	 * At this point it most likely is. We are taking a reference for
	 * deferred processing.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
		    "is not attached",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}

	/* M_WAITOK | M_ZERO: blocks until memory is available, zero-filled */
	MALLOC(p_ifnet_ioctl_ev, struct ifnet_ioctl_event_nwk_wq_entry *,
	    sizeof(struct ifnet_ioctl_event_nwk_wq_entry),
	    M_NWKWQ, M_WAITOK | M_ZERO);

	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;

	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
	p_ifnet_ioctl_ev->nwk_wqe.is_arg_managed = TRUE;
	p_ifnet_ioctl_ev->nwk_wqe.arg = &p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg;
	nwk_wq_enqueue((struct nwk_wq_entry*)p_ifnet_ioctl_ev);
}
7217 
7218 static void
ifnet_ioctl_event_callback(void * arg)7219 ifnet_ioctl_event_callback(void *arg)
7220 {
7221 	struct ifnet_ioctl_event *p_ifnet_ioctl_ev = (struct ifnet_ioctl_event *)arg;
7222 	struct ifnet *ifp = p_ifnet_ioctl_ev->ifp;
7223 	u_long ioctl_code = p_ifnet_ioctl_ev->ioctl_code;
7224 	int ret = 0;
7225 
7226 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7227 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7228 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7229 	} else if (dlil_verbose) {
7230 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7231 		    "for ioctl %lu",
7232 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7233 	}
7234 	ifnet_decr_iorefcnt(ifp);
7235 	return;
7236 }
7237 
/*
 * Dispatch an ioctl through the DLIL stack: first to every attached
 * interface filter, then to the protocol (if proto_fam is non-zero),
 * and finally to the interface's own if_ioctl.  The first non-trivial
 * result (anything other than EOPNOTSUPP) wins; EJUSTRETURN is mapped
 * to 0 on return.  An io refcount is held across the whole operation.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;        /* "nobody handled it yet" sentinel */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7355 
7356 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7357 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7358 {
7359 	errno_t error = 0;
7360 
7361 
7362 	if (ifp->if_set_bpf_tap) {
7363 		/* Get an io reference on the interface if it is attached */
7364 		if (!ifnet_is_attached(ifp, 1)) {
7365 			return ENXIO;
7366 		}
7367 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7368 		ifnet_decr_iorefcnt(ifp);
7369 	}
7370 	return error;
7371 }
7372 
/*
 * Resolve a multicast protocol address to a link-layer address.
 * The attached protocol's resolver (if any) gets first shot; the
 * driver's if_check_multi then verifies either the resolved
 * link-layer address or, failing a resolver, the protocol address.
 * Returns 0 on success, EOPNOTSUPP if nobody could handle it, or an
 * error from the resolver/driver.
 */
errno_t
dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
    struct sockaddr *ll_addr, size_t ll_len)
{
	errno_t result = EOPNOTSUPP;
	struct if_proto *proto;
	const struct sockaddr *verify;
	proto_media_resolve_multi resolvep;

	/* take an IO refcnt so the ifnet can't detach underneath us */
	if (!ifnet_is_attached(ifp, 1)) {
		return result;
	}

	bzero(ll_addr, ll_len);

	/* Call the protocol first; callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, proto_addr->sa_family);
	ifnet_lock_done(ifp);
	if (proto != NULL) {
		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
		if (resolvep != NULL) {
			result = resolvep(ifp, proto_addr,
			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
		}
		if_proto_free(proto);
	}

	/* Let the interface verify the multicast address */
	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
		if (result == 0) {
			/* resolver succeeded: verify the link-layer address */
			verify = ll_addr;
		} else {
			/* no resolver: let the driver check the proto address */
			verify = proto_addr;
		}
		result = ifp->if_check_multi(ifp, verify);
	}

	ifnet_decr_iorefcnt(ifp);
	return result;
}
7415 
7416 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7417 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7418     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7419     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7420 {
7421 	struct if_proto *proto;
7422 	errno_t result = 0;
7423 
7424 	/* callee holds a proto refcnt upon success */
7425 	ifnet_lock_shared(ifp);
7426 	proto = find_attached_proto(ifp, target_proto->sa_family);
7427 	ifnet_lock_done(ifp);
7428 	if (proto == NULL) {
7429 		result = ENOTSUP;
7430 	} else {
7431 		proto_media_send_arp    arpp;
7432 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7433 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7434 		if (arpp == NULL) {
7435 			result = ENOTSUP;
7436 		} else {
7437 			switch (arpop) {
7438 			case ARPOP_REQUEST:
7439 				arpstat.txrequests++;
7440 				if (target_hw != NULL) {
7441 					arpstat.txurequests++;
7442 				}
7443 				break;
7444 			case ARPOP_REPLY:
7445 				arpstat.txreplies++;
7446 				break;
7447 			}
7448 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7449 			    target_hw, target_proto);
7450 		}
7451 		if_proto_free(proto);
7452 	}
7453 
7454 	return result;
7455 }
7456 
/*
 * Thread "marks" are bit flags kept in the current uthread's
 * uu_network_marks.  A marks handle is encoded as a pointer offset from
 * net_thread_marks_base (see net_thread_marks_push/pop below), so no
 * allocation is ever required to carry the bits back to the pop.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Handle meaning "no marks were newly set" (offset zero from base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7462 
7463 __private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)7464 net_thread_marks_push(u_int32_t push)
7465 {
7466 	static const char *const base = (const void*)&net_thread_marks_base;
7467 	u_int32_t pop = 0;
7468 
7469 	if (push != 0) {
7470 		struct uthread *uth = current_uthread();
7471 
7472 		pop = push & ~uth->uu_network_marks;
7473 		if (pop != 0) {
7474 			uth->uu_network_marks |= pop;
7475 		}
7476 	}
7477 
7478 	return (net_thread_marks_t)&base[pop];
7479 }
7480 
7481 __private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)7482 net_thread_unmarks_push(u_int32_t unpush)
7483 {
7484 	static const char *const base = (const void*)&net_thread_marks_base;
7485 	u_int32_t unpop = 0;
7486 
7487 	if (unpush != 0) {
7488 		struct uthread *uth = current_uthread();
7489 
7490 		unpop = unpush & uth->uu_network_marks;
7491 		if (unpop != 0) {
7492 			uth->uu_network_marks &= ~unpop;
7493 		}
7494 	}
7495 
7496 	return (net_thread_marks_t)&base[unpop];
7497 }
7498 
/*
 * Undo a net_thread_marks_push(): recover the bits the push set from
 * the handle's offset from `base' and clear exactly those bits.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* offset must fit in 32 bits, and every bit must still be set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7514 
/*
 * Undo a net_thread_unmarks_push(): recover the bits the unpush
 * cleared from the handle's offset from `base' and set them again.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* offset must fit in 32 bits, and every bit must be clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7530 
7531 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7532 net_thread_is_marked(u_int32_t check)
7533 {
7534 	if (check != 0) {
7535 		struct uthread *uth = current_uthread();
7536 		return uth->uu_network_marks & check;
7537 	} else {
7538 		return 0;
7539 	}
7540 }
7541 
7542 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7543 net_thread_is_unmarked(u_int32_t check)
7544 {
7545 	if (check != 0) {
7546 		struct uthread *uth = current_uthread();
7547 		return ~uth->uu_network_marks & check;
7548 	} else {
7549 		return 0;
7550 	}
7551 }
7552 
7553 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7554 _is_announcement(const struct sockaddr_in * sender_sin,
7555     const struct sockaddr_in * target_sin)
7556 {
7557 	if (target_sin == NULL || sender_sin == NULL) {
7558 		return FALSE;
7559 	}
7560 
7561 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7562 }
7563 
/*
 * Send an ARP for `target_proto0' via `ifp'.  An ARP request whose
 * target is an IPv4 link-local address (and which is not an
 * announcement) is instead sent on every IFEF_ARPLL-capable interface;
 * everything else goes straight to dlil_send_arp_internal().
 * `rtflags' lets a router target be flagged to the send_arp callback.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr ifaddr alive across the send */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first non-ENOTSUP result */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7678 
7679 /*
7680  * Caller must hold ifnet head lock.
7681  */
7682 static int
ifnet_lookup(struct ifnet * ifp)7683 ifnet_lookup(struct ifnet *ifp)
7684 {
7685 	struct ifnet *_ifp;
7686 
7687 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7688 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7689 		if (_ifp == ifp) {
7690 			break;
7691 		}
7692 	}
7693 	return _ifp != NULL;
7694 }
7695 
7696 /*
7697  * Caller has to pass a non-zero refio argument to get a
7698  * IO reference count. This will prevent ifnet_detach from
7699  * being called when there are outstanding io reference counts.
7700  */
7701 int
ifnet_is_attached(struct ifnet * ifp,int refio)7702 ifnet_is_attached(struct ifnet *ifp, int refio)
7703 {
7704 	int ret;
7705 
7706 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7707 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7708 		if (refio > 0) {
7709 			ifp->if_refio++;
7710 		}
7711 	}
7712 	lck_mtx_unlock(&ifp->if_ref_lock);
7713 
7714 	return ret;
7715 }
7716 
/*
 * Bump the count of interface threads that have been created but have
 * not finished starting up; paired with
 * ifnet_decr_pending_thread_count() below.
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7724 
/*
 * Drop the pending-thread count; when it reaches zero, wake up any
 * waiter sleeping on if_threads_pending.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7736 
7737 /*
7738  * Caller must ensure the interface is attached; the assumption is that
7739  * there is at least an outstanding IO reference count held already.
7740  * Most callers would call ifnet_is_{attached,data_ready}() instead.
7741  */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* caller's contract: fully attached with an IO ref already held */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7751 
/*
 * Drop one IO reference with if_ref_lock already held; wakes up a
 * pending ifnet_detach when the last reference goes away.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* every datamov ref is also an IO ref, so datamov can't outlive refio */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
7772 
/* Release an IO reference; locked wrapper around ifnet_decr_iorefcnt_locked() */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7780 
7781 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)7782 ifnet_datamov_begin(struct ifnet *ifp)
7783 {
7784 	boolean_t ret;
7785 
7786 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7787 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
7788 		ifp->if_refio++;
7789 		ifp->if_datamov++;
7790 	}
7791 	lck_mtx_unlock(&ifp->if_ref_lock);
7792 
7793 	return ret;
7794 }
7795 
/*
 * Leave a data-movement section begun by ifnet_datamov_begin(): drops
 * the datamov count and the IO reference taken there.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7811 
/*
 * Suspend data movement: takes an IO reference and, on the first
 * suspension, clears IFRF_READY so ifnet_datamov_begin() starts
 * failing.  Balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7824 
/*
 * Block until every in-flight data-movement section has ended; data
 * movement must already be suspended via ifnet_datamov_suspend().
 * Once quiesced, the interface's transmit queue(s) are flushed.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* ifnet_datamov_end() wakes us when if_datamov hits zero */
	while (ifp->if_datamov != 0) {
#if SKYWALK
		SK_ERR("Waiting for data path(s) to quiesce on %s",
		    if_name(ifp));
#endif /* SKYWALK */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
7852 
/*
 * Resume data movement: releases the IO reference taken by the
 * matching ifnet_datamov_suspend(); on the last resume, restores
 * IFRF_READY so ifnet_datamov_begin() can succeed again.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7866 
/*
 * Record a refcount hold/release backtrace for a DLIF_DEBUG-enabled
 * dlil_ifnet into its circular trace history.
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	/* pick the hold or the release history */
	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* atomically claim the next slot in the circular buffer */
	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
7891 
/*
 * Take a reference on the dlil_ifnet backing `ifp'.  Panics on
 * refcount wraparound.  Returns EINVAL for a NULL ifp, else 0.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
7914 
/*
 * Drop a reference on the dlil_ifnet backing `ifp'.  When the final
 * reference of an embryonic (never fully attached) interface is
 * dropped, the underlying storage is released.  Panics on underflow.
 * Returns EINVAL for a NULL ifp, else 0.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* last reference: embryonic ifnets are torn down below */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	/* release outside dl_if_lock to avoid doing teardown under a spinlock */
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
7949 
/*
 * Attach `proto' to its interface: registers the demux descriptors
 * with the family module, appends the proto to the interface's
 * protocol hash, and posts KEV_DL_PROTO_ATTACHED.  On success a proto
 * refcnt is held for the attachment and *proto_count (if non-NULL) is
 * set to the number of attached protocols.  Returns EEXIST if the
 * family is already attached, ENXIO if the interface is detaching, or
 * the family add_proto routine's error.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an IO ref; released at ioref_done */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the hash chain and append there */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data));
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8029 
/*
 * Post-attach side effects: bring the interface up and, on Skywalk
 * systems, attach the flowswitch nexus when IP (v4/v6) is attached.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8053 
/*
 * Public KPI: attach a v1 protocol handler to `ifp'.  Validates the
 * arguments, verifies the interface is on the global list, copies the
 * v1 callbacks into a freshly allocated if_proto, then hands off to
 * dlil_attach_protocol().  On failure the if_proto is freed here.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up, etc. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8115 
/*
 * Public KPI: attach a v2 protocol handler to `ifp'.  Identical in
 * structure to ifnet_attach_protocol() except it fills in the v2
 * callback vector (v2 input takes an mbuf chain without a separate
 * frame header pointer).
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up, etc. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8177 
/*
 * Public KPI: detach protocol `proto_family' from `ifp'.  Removes the
 * proto from the hash, redirects its KPI vectors to the inert
 * ifproto_media_* stubs below (so racing callers get ENXIO rather
 * than calling into a detached protocol), then drops both the lookup
 * reference and the attach reference; final teardown happens when the
 * last proto reference is released.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* point the KPI vectors at the inert stubs (all return ENXIO) */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8243 
8244 
/* Inert v1 input handler installed by ifnet_detach_protocol(); always ENXIO */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8252 
/* Inert v2 input handler installed by ifnet_detach_protocol(); always ENXIO */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8260 
/* Inert pre_output handler installed on protocol detach; always ENXIO */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8269 
/* Inert event handler installed on protocol detach; drops the event */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8276 
/* Inert ioctl handler installed on protocol detach; always ENXIO */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8284 
/* Inert multicast resolver installed on protocol detach; always ENXIO */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8292 
/* Inert send_arp handler installed on protocol detach; always ENXIO */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8301 
8302 extern int if_next_index(void);
8303 extern int tcp_ecn_outbound;
8304 
8305 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8306 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8307 {
8308 	uint32_t sflags = 0;
8309 	int err;
8310 
8311 	if (if_flowadv) {
8312 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8313 	}
8314 
8315 	if (if_delaybased_queue) {
8316 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8317 	}
8318 
8319 	if (ifp->if_output_sched_model ==
8320 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8321 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8322 	}
8323 	/* Inherit drop limit from the default queue */
8324 	if (ifp->if_snd != ifcq) {
8325 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8326 	}
8327 	/* Initialize transmit queue(s) */
8328 	err = ifclassq_setup(ifcq, ifp, sflags);
8329 	if (err != 0) {
8330 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8331 		    "err=%d", __func__, ifp, err);
8332 		/* NOTREACHED */
8333 	}
8334 }
8335 
/*
 * Final stage of interface attachment: insert the ifnet into the global
 * interface list and index table, set up its link-level address, transmit
 * and (optionally) receive threads, per-interface state, and finally mark
 * it IFRF_ATTACHED | IFRF_READY (clearing the embryonic state).
 *
 * @param ifp      interface to attach; must currently be IFRF_EMBRYONIC.
 * @param ll_addr  optional link-layer address.  When non-NULL, its
 *                 sdl_alen must match ifp->if_addrlen (or if_addrlen must
 *                 still be 0, in which case it is adopted).
 *
 * @return 0 on success; EINVAL for a NULL ifp or mismatched address
 *         length; EEXIST if already on the interface list; ENODEV if the
 *         family module callbacks are missing; ENOBUFS on index or
 *         address allocation failure.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	/* Attaching a non-embryonic ifnet indicates a lifecycle bug: panic */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* Adopt the supplied address length, or reject a mismatch */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = zalloc_flags(dlif_phash_zone,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Interface filter list must start out empty and idle */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Recycled ifnets (DLIF_REUSE) keep their multicast list intact */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	/* Assign an interface index if the caller didn't pick one */
	if (ifp->if_index == 0) {
		int idx = if_next_index();

		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Publish the ifnet: visible via the list and the index table */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		/*
		 * On success thfunc receives the thread continuation that was
		 * chosen for the input thread; ENODEV means no dedicated
		 * input thread is used for this interface.
		 */
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* Give the starter thread a slight scheduling boost */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* Same slight scheduling boost as the start thread above */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/*
	 * Count link-layer/unspecified multicast memberships carried over
	 * from a previous incarnation of this (recycled) interface.
	 */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
8823 
8824 /*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself.  Although the link
8827  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
8828  * its location in memory must never change as it may still be referred
8829  * to by some parts of the system afterwards (unfortunate implementation
8830  * artifacts inherited from BSD.)
8831  *
8832  * Caller must hold ifnet lock as writer.
8833  */
/*
 * Allocate (or reuse) and populate the interface's link-level ifaddr,
 * including its AF_LINK address and netmask sockaddr_dl.  Returns the
 * ifaddr with a reference held for ifp->if_lladdr, or NULL if an
 * out-of-line allocation fails.  The previous if_lladdr, if any, is
 * released before returning.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * sdl_data carries the interface name followed by the link-layer
	 * address bytes; compute how much sockaddr_dl storage that needs,
	 * rounded up to a 32-bit boundary.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;
			ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
			if (ifa == NULL) {
				return NULL;
			}
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* Fill in the AF_LINK address: name, index, type, then the bytes */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* Netmask: all-ones over the name portion of sdl_data */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference previously held by if_lladdr, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
8944 
/*
 * Remove the interface's configured network-layer addresses:
 * IPv4 (only when INET is compiled in) and IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
8953 
/*
 * Begin detaching an interface.  Marks the ifnet IFRF_DETACHING, removes
 * it from the global list and index table (so lookups no longer find it),
 * releases state that is safe to drop immediately, and then queues the
 * ifnet for the detacher thread, which performs the final teardown in
 * ifnet_detach_final() once all I/O references drain.
 *
 * @return 0 on success; EINVAL if ifp is NULL or was never attached;
 *         ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;
#if SKYWALK
	if_nexus_netif nexus_netif;
	if_nexus_flowswitch nexus_fsw;
#endif /* SKYWALK */

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Invalidate the ND CGA state before taking the interface down */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* Transition IFRF_ATTACHED -> IFRF_DETACHING under if_ref_lock */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

#if SKYWALK
	/* save then clear the nexus configuration */
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	/* detach nexus configuration (done outside the ifnet locks) */
	dlil_detach_flowswitch_nexus(&nexus_fsw);
	dlil_detach_netif_nexus(&nexus_netif);
#endif /* SKYWALK */

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9163 
9164 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9165 ifnet_detaching_enqueue(struct ifnet *ifp)
9166 {
9167 	dlil_if_lock_assert();
9168 
9169 	++ifnet_detaching_cnt;
9170 	VERIFY(ifnet_detaching_cnt != 0);
9171 	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9172 	wakeup((caddr_t)&ifnet_delayed_run);
9173 }
9174 
9175 static struct ifnet *
ifnet_detaching_dequeue(void)9176 ifnet_detaching_dequeue(void)
9177 {
9178 	struct ifnet *ifp;
9179 
9180 	dlil_if_lock_assert();
9181 
9182 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9183 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9184 	if (ifp != NULL) {
9185 		VERIFY(ifnet_detaching_cnt != 0);
9186 		--ifnet_detaching_cnt;
9187 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9188 		ifp->if_detaching_link.tqe_next = NULL;
9189 		ifp->if_detaching_link.tqe_prev = NULL;
9190 	}
9191 	return ifp;
9192 }
9193 
/*
 * Continuation body of the detacher thread.  Drains the detaching queue,
 * calling ifnet_detach_final() on each ifnet (with the dlil lock dropped
 * around the call), then blocks on ifnet_delayed_run with itself as the
 * continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	/* First wakeup after thread creation: leave the embryonic state */
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constrain so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock across the (blocking) final detach */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* Queue drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9236 
/*
 * Entry point for the detacher thread.  Arms the initial wait,
 * marks the thread as embryonic (cleared on the first pass through
 * the continuation), then blocks into ifnet_detacher_thread_cont().
 * Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9253 
9254 static void
ifnet_detach_final(struct ifnet * ifp)9255 ifnet_detach_final(struct ifnet *ifp)
9256 {
9257 	struct ifnet_filter *filter, *filter_next;
9258 	struct dlil_ifnet *dlifp;
9259 	struct ifnet_filter_head fhead;
9260 	struct dlil_threading_info *inp;
9261 	struct ifaddr *ifa;
9262 	ifnet_detached_func if_free;
9263 	int i;
9264 
9265 	lck_mtx_lock(&ifp->if_ref_lock);
9266 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9267 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9268 		    __func__, ifp);
9269 		/* NOTREACHED */
9270 	}
9271 
9272 	/*
9273 	 * Wait until the existing IO references get released
9274 	 * before we proceed with ifnet_detach.  This is not a
9275 	 * common case, so block without using a continuation.
9276 	 */
9277 	while (ifp->if_refio > 0) {
9278 		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9279 		    "to be released\n", __func__, if_name(ifp));
9280 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9281 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9282 	}
9283 
9284 	VERIFY(ifp->if_datamov == 0);
9285 	VERIFY(ifp->if_drainers == 0);
9286 	VERIFY(ifp->if_suspend == 0);
9287 	ifp->if_refflags &= ~IFRF_READY;
9288 	lck_mtx_unlock(&ifp->if_ref_lock);
9289 
9290 	/* Clear agent IDs */
9291 	if (ifp->if_agentids != NULL) {
9292 		kfree_data(ifp->if_agentids,
9293 		    sizeof(uuid_t) * ifp->if_agentcount);
9294 		ifp->if_agentids = NULL;
9295 	}
9296 	ifp->if_agentcount = 0;
9297 
9298 #if SKYWALK
9299 	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9300 #endif /* SKYWALK */
9301 	/* Drain and destroy send queue */
9302 	ifclassq_teardown(ifp->if_snd);
9303 
9304 	/* Detach interface filters */
9305 	lck_mtx_lock(&ifp->if_flt_lock);
9306 	if_flt_monitor_enter(ifp);
9307 
9308 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9309 	fhead = ifp->if_flt_head;
9310 	TAILQ_INIT(&ifp->if_flt_head);
9311 
9312 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9313 		filter_next = TAILQ_NEXT(filter, filt_next);
9314 		lck_mtx_unlock(&ifp->if_flt_lock);
9315 
9316 		dlil_detach_filter_internal(filter, 1);
9317 		lck_mtx_lock(&ifp->if_flt_lock);
9318 	}
9319 	if_flt_monitor_leave(ifp);
9320 	lck_mtx_unlock(&ifp->if_flt_lock);
9321 
9322 	/* Tell upper layers to drop their network addresses */
9323 	if_purgeaddrs(ifp);
9324 
9325 	ifnet_lock_exclusive(ifp);
9326 
9327 	/* Unplumb all protocols */
9328 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9329 		struct if_proto *proto;
9330 
9331 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9332 		while (proto != NULL) {
9333 			protocol_family_t family = proto->protocol_family;
9334 			ifnet_lock_done(ifp);
9335 			proto_unplumb(family, ifp);
9336 			ifnet_lock_exclusive(ifp);
9337 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9338 		}
9339 		/* There should not be any protocols left */
9340 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9341 	}
9342 	zfree(dlif_phash_zone, ifp->if_proto_hash);
9343 	ifp->if_proto_hash = NULL;
9344 
9345 	/* Detach (permanent) link address from if_addrhead */
9346 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9347 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9348 	IFA_LOCK(ifa);
9349 	if_detach_link_ifa(ifp, ifa);
9350 	IFA_UNLOCK(ifa);
9351 
9352 	/* Remove (permanent) link address from ifnet_addrs[] */
9353 	IFA_REMREF(ifa);
9354 	ifnet_addrs[ifp->if_index - 1] = NULL;
9355 
9356 	/* This interface should not be on {ifnet_head,detaching} */
9357 	VERIFY(ifp->if_link.tqe_next == NULL);
9358 	VERIFY(ifp->if_link.tqe_prev == NULL);
9359 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9360 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9361 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9362 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9363 
9364 	/* The slot should have been emptied */
9365 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9366 
9367 	/* There should not be any addresses left */
9368 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9369 
9370 	/*
9371 	 * Signal the starter thread to terminate itself, and wait until
9372 	 * it has exited.
9373 	 */
9374 	if (ifp->if_start_thread != THREAD_NULL) {
9375 		lck_mtx_lock_spin(&ifp->if_start_lock);
9376 		ifp->if_start_flags |= IFSF_TERMINATING;
9377 		wakeup_one((caddr_t)&ifp->if_start_thread);
9378 		lck_mtx_unlock(&ifp->if_start_lock);
9379 
9380 		/* wait for starter thread to terminate */
9381 		lck_mtx_lock(&ifp->if_start_lock);
9382 		while (ifp->if_start_thread != THREAD_NULL) {
9383 			if (dlil_verbose) {
9384 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9385 				    __func__,
9386 				    if_name(ifp));
9387 			}
9388 			(void) msleep(&ifp->if_start_thread,
9389 			    &ifp->if_start_lock, (PZERO - 1),
9390 			    "ifnet_start_thread_exit", NULL);
9391 		}
9392 		lck_mtx_unlock(&ifp->if_start_lock);
9393 		if (dlil_verbose) {
9394 			DLIL_PRINTF("%s: %s starter thread termination complete",
9395 			    __func__, if_name(ifp));
9396 		}
9397 	}
9398 
9399 	/*
9400 	 * Signal the poller thread to terminate itself, and wait until
9401 	 * it has exited.
9402 	 */
9403 	if (ifp->if_poll_thread != THREAD_NULL) {
9404 #if SKYWALK
9405 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9406 #endif /* SKYWALK */
9407 		lck_mtx_lock_spin(&ifp->if_poll_lock);
9408 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9409 		wakeup_one((caddr_t)&ifp->if_poll_thread);
9410 		lck_mtx_unlock(&ifp->if_poll_lock);
9411 
9412 		/* wait for poller thread to terminate */
9413 		lck_mtx_lock(&ifp->if_poll_lock);
9414 		while (ifp->if_poll_thread != THREAD_NULL) {
9415 			if (dlil_verbose) {
9416 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9417 				    __func__,
9418 				    if_name(ifp));
9419 			}
9420 			(void) msleep(&ifp->if_poll_thread,
9421 			    &ifp->if_poll_lock, (PZERO - 1),
9422 			    "ifnet_poll_thread_exit", NULL);
9423 		}
9424 		lck_mtx_unlock(&ifp->if_poll_lock);
9425 		if (dlil_verbose) {
9426 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
9427 			    __func__, if_name(ifp));
9428 		}
9429 	}
9430 
9431 	/*
9432 	 * If thread affinity was set for the workloop thread, we will need
9433 	 * to tear down the affinity and release the extra reference count
9434 	 * taken at attach time.  Does not apply to lo0 or other interfaces
9435 	 * without dedicated input threads.
9436 	 */
9437 	if ((inp = ifp->if_inp) != NULL) {
9438 		VERIFY(inp != dlil_main_input_thread);
9439 
9440 		if (inp->dlth_affinity) {
9441 			struct thread *tp, *wtp, *ptp;
9442 
9443 			lck_mtx_lock_spin(&inp->dlth_lock);
9444 			wtp = inp->dlth_driver_thread;
9445 			inp->dlth_driver_thread = THREAD_NULL;
9446 			ptp = inp->dlth_poller_thread;
9447 			inp->dlth_poller_thread = THREAD_NULL;
9448 			ASSERT(inp->dlth_thread != THREAD_NULL);
9449 			tp = inp->dlth_thread;    /* don't nullify now */
9450 			inp->dlth_affinity_tag = 0;
9451 			inp->dlth_affinity = FALSE;
9452 			lck_mtx_unlock(&inp->dlth_lock);
9453 
9454 			/* Tear down poll thread affinity */
9455 			if (ptp != NULL) {
9456 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9457 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
9458 				(void) dlil_affinity_set(ptp,
9459 				    THREAD_AFFINITY_TAG_NULL);
9460 				thread_deallocate(ptp);
9461 			}
9462 
9463 			/* Tear down workloop thread affinity */
9464 			if (wtp != NULL) {
9465 				(void) dlil_affinity_set(wtp,
9466 				    THREAD_AFFINITY_TAG_NULL);
9467 				thread_deallocate(wtp);
9468 			}
9469 
9470 			/* Tear down DLIL input thread affinity */
9471 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9472 			thread_deallocate(tp);
9473 		}
9474 
9475 		/* disassociate ifp DLIL input thread */
9476 		ifp->if_inp = NULL;
9477 
9478 		/* if the worker thread was created, tell it to terminate */
9479 		if (inp->dlth_thread != THREAD_NULL) {
9480 			lck_mtx_lock_spin(&inp->dlth_lock);
9481 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9482 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9483 				wakeup_one((caddr_t)&inp->dlth_flags);
9484 			}
9485 			lck_mtx_unlock(&inp->dlth_lock);
9486 			ifnet_lock_done(ifp);
9487 
9488 			/* wait for the input thread to terminate */
9489 			lck_mtx_lock_spin(&inp->dlth_lock);
9490 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9491 			    == 0) {
9492 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9493 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9494 			}
9495 			lck_mtx_unlock(&inp->dlth_lock);
9496 			ifnet_lock_exclusive(ifp);
9497 		}
9498 
9499 		/* clean-up input thread state */
9500 		dlil_clean_threading_info(inp);
9501 		/* clean-up poll parameters */
9502 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
9503 		dlil_reset_rxpoll_params(ifp);
9504 	}
9505 
9506 	/* The driver might unload, so point these to ourselves */
9507 	if_free = ifp->if_free;
9508 	ifp->if_output_dlil = ifp_if_output;
9509 	ifp->if_output = ifp_if_output;
9510 	ifp->if_pre_enqueue = ifp_if_output;
9511 	ifp->if_start = ifp_if_start;
9512 	ifp->if_output_ctl = ifp_if_ctl;
9513 	ifp->if_input_dlil = ifp_if_input;
9514 	ifp->if_input_poll = ifp_if_input_poll;
9515 	ifp->if_input_ctl = ifp_if_ctl;
9516 	ifp->if_ioctl = ifp_if_ioctl;
9517 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9518 	ifp->if_free = ifp_if_free;
9519 	ifp->if_demux = ifp_if_demux;
9520 	ifp->if_event = ifp_if_event;
9521 	ifp->if_framer_legacy = ifp_if_framer;
9522 	ifp->if_framer = ifp_if_framer_extended;
9523 	ifp->if_add_proto = ifp_if_add_proto;
9524 	ifp->if_del_proto = ifp_if_del_proto;
9525 	ifp->if_check_multi = ifp_if_check_multi;
9526 
9527 	/* wipe out interface description */
9528 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9529 	ifp->if_desc.ifd_len = 0;
9530 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9531 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9532 
9533 	/* there shouldn't be any delegation by now */
9534 	VERIFY(ifp->if_delegated.ifp == NULL);
9535 	VERIFY(ifp->if_delegated.type == 0);
9536 	VERIFY(ifp->if_delegated.family == 0);
9537 	VERIFY(ifp->if_delegated.subfamily == 0);
9538 	VERIFY(ifp->if_delegated.expensive == 0);
9539 	VERIFY(ifp->if_delegated.constrained == 0);
9540 
9541 	/* QoS marking get cleared */
9542 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9543 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9544 
9545 #if SKYWALK
9546 	/* the nexus destructor is responsible for clearing these */
9547 	VERIFY(ifp->if_na_ops == NULL);
9548 	VERIFY(ifp->if_na == NULL);
9549 #endif /* SKYWALK */
9550 
9551 	/* promiscuous count needs to start at zero again */
9552 	ifp->if_pcount = 0;
9553 	ifp->if_flags &= ~IFF_PROMISC;
9554 
9555 	ifnet_lock_done(ifp);
9556 
9557 #if PF
9558 	/*
9559 	 * Detach this interface from packet filter, if enabled.
9560 	 */
9561 	pf_ifnet_hook(ifp, 0);
9562 #endif /* PF */
9563 
9564 	/* Filter list should be empty */
9565 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9566 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9567 	VERIFY(ifp->if_flt_busy == 0);
9568 	VERIFY(ifp->if_flt_waiters == 0);
9569 	VERIFY(ifp->if_flt_non_os_count == 0);
9570 	VERIFY(ifp->if_flt_no_tso_count == 0);
9571 	lck_mtx_unlock(&ifp->if_flt_lock);
9572 
9573 	/* Last chance to drain send queue */
9574 	if_qflush_snd(ifp, 0);
9575 
9576 	/* Last chance to cleanup any cached route */
9577 	lck_mtx_lock(&ifp->if_cached_route_lock);
9578 	VERIFY(!ifp->if_fwd_cacheok);
9579 	ROUTE_RELEASE(&ifp->if_fwd_route);
9580 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9581 	ROUTE_RELEASE(&ifp->if_src_route);
9582 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9583 	ROUTE_RELEASE(&ifp->if_src_route6);
9584 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9585 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9586 
9587 	VERIFY(ifp->if_data_threshold == 0);
9588 	VERIFY(ifp->if_dt_tcall != NULL);
9589 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9590 
9591 	ifnet_llreach_ifdetach(ifp);
9592 
9593 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0);
9594 
9595 	/*
9596 	 * Finally, mark this ifnet as detached.
9597 	 */
9598 	if (dlil_verbose) {
9599 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
9600 	}
9601 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9602 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9603 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9604 		    __func__, ifp);
9605 		/* NOTREACHED */
9606 	}
9607 	ifp->if_refflags &= ~IFRF_DETACHING;
9608 	lck_mtx_unlock(&ifp->if_ref_lock);
9609 	if (if_free != NULL) {
9610 		if_free(ifp);
9611 	}
9612 
9613 	ifclassq_release(&ifp->if_snd);
9614 
9615 	/* we're fully detached, clear the "in use" bit */
9616 	dlifp = (struct dlil_ifnet *)ifp;
9617 	lck_mtx_lock(&dlifp->dl_if_lock);
9618 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9619 	dlifp->dl_if_flags &= ~DLIF_INUSE;
9620 	lck_mtx_unlock(&dlifp->dl_if_lock);
9621 
9622 	/* Release reference held during ifnet attach */
9623 	ifnet_release(ifp);
9624 }
9625 
9626 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9627 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9628 {
9629 #pragma unused(ifp)
9630 	m_freem_list(m);
9631 	return 0;
9632 }
9633 
/*
 * Start stub installed on a detached ifnet: purge anything still
 * sitting in the send queue instead of transmitting it.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9639 
9640 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9641 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9642     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9643     boolean_t poll, struct thread *tp)
9644 {
9645 #pragma unused(ifp, m_tail, s, poll, tp)
9646 	m_freem_list(m_head);
9647 	return ENXIO;
9648 }
9649 
9650 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9651 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9652     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9653 {
9654 #pragma unused(ifp, flags, max_cnt)
9655 	if (m_head != NULL) {
9656 		*m_head = NULL;
9657 	}
9658 	if (m_tail != NULL) {
9659 		*m_tail = NULL;
9660 	}
9661 	if (cnt != NULL) {
9662 		*cnt = 0;
9663 	}
9664 	if (len != NULL) {
9665 		*len = 0;
9666 	}
9667 }
9668 
9669 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)9670 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9671 {
9672 #pragma unused(ifp, cmd, arglen, arg)
9673 	return EOPNOTSUPP;
9674 }
9675 
9676 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)9677 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9678 {
9679 #pragma unused(ifp, fh, pf)
9680 	m_freem(m);
9681 	return EJUSTRETURN;
9682 }
9683 
9684 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)9685 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9686     const struct ifnet_demux_desc *da, u_int32_t dc)
9687 {
9688 #pragma unused(ifp, pf, da, dc)
9689 	return EINVAL;
9690 }
9691 
9692 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)9693 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9694 {
9695 #pragma unused(ifp, pf)
9696 	return EINVAL;
9697 }
9698 
9699 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)9700 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
9701 {
9702 #pragma unused(ifp, sa)
9703 	return EOPNOTSUPP;
9704 }
9705 
/*
 * Legacy framer stub installed on a detached ifnet.  The signature
 * differs by platform: non-macOS targets carry the extra pre/post
 * out-parameters, macOS does not.  Either way the work is forwarded
 * to ifp_if_framer_extended(), which frees the packet.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	/* macOS has no pre/post parameters; pass NULL for both */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
9724 
9725 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)9726 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
9727     const struct sockaddr *sa, const char *ll, const char *t,
9728     u_int32_t *pre, u_int32_t *post)
9729 {
9730 #pragma unused(ifp, sa, ll, t)
9731 	m_freem(*m);
9732 	*m = NULL;
9733 
9734 	if (pre != NULL) {
9735 		*pre = 0;
9736 	}
9737 	if (post != NULL) {
9738 		*post = 0;
9739 	}
9740 
9741 	return EJUSTRETURN;
9742 }
9743 
9744 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)9745 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
9746 {
9747 #pragma unused(ifp, cmd, arg)
9748 	return EOPNOTSUPP;
9749 }
9750 
9751 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)9752 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
9753 {
9754 #pragma unused(ifp, tm, f)
9755 	/* XXX not sure what to do here */
9756 	return 0;
9757 }
9758 
/*
 * No-op free callback installed on a detached ifnet.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
	(void)ifp;
}
9764 
/*
 * No-op event callback installed on a detached ifnet.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
	(void)ifp;
	(void)e;
}
9770 
/*
 * Acquire a dlil_ifnet for the given family/uniqueid/name, either by
 * recycling a previously detached instance with a matching unique id
 * or by allocating a fresh one from dlif_zone.
 *
 * Returns 0 with *ifp set (and a dlil reference held) on success;
 * EBUSY if an in-use interface with the same extended name or unique
 * id already exists; ENOMEM if the unique-id copy cannot be
 * allocated.  Called with *ifp == NULL.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.  To guarantee that, the whole list must be
	 * traversed — no early exit on a recyclable match.
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		/* recheck under the per-interface lock before claiming it */
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len,
		    M_NKE, M_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/* free the raw (unaligned) allocation, not base */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
9946 
/*
 * Common release path for a dlil_ifnet: drop allocation statistics,
 * free any out-of-line broadcast address, reset the interface name
 * and extended name back to their in-object storage (marking the
 * xname stale with a '?' suffix), and optionally clear DLIF_INUSE so
 * the instance can be recycled by dlil_if_acquire().
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address was heap-allocated only if it didn't fit inline */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
9977 
/* Release a dlil_ifnet without clearing its in-use flag. */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
9983 
/* Acquire the global dlil interface-list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
9989 
/* Release the global dlil interface-list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
9995 
/* Assert that the current thread owns the dlil interface-list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10001 
/*
 * Unplumb the protocols that attach implicitly (PF_INET and PF_INET6)
 * from the given interface; see the bucket layout note below.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10017 
/*
 * Copy the interface's cached IPv4 source route into *dst under the
 * cached-route lock.  route_copyout() transfers a reference, so the
 * caller owns the copied route.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	/* take as spin then convert: the copy may not be spin-safe */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10028 
/*
 * Store *src as the interface's cached IPv4 source route, consuming
 * the caller's reference.  If route caching is disabled for this
 * interface (if_fwd_cacheok cleared, e.g. during detach), the route
 * is released instead of cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10042 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * source route into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10054 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): cache *src (consuming
 * the caller's reference) unless caching is disabled, in which case
 * the route is released.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10069 
/*
 * Look up a scoped IPv4 route to src_ip via this interface, using the
 * per-ifnet one-entry route cache.  On a cache hit the cached rtentry
 * is returned; on a miss a fresh scoped lookup is performed and (if
 * caching is enabled) stored back into the cache.  Returns NULL when
 * no route exists.  The caller owns a reference on the returned
 * rtentry.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	/* cache miss: route unusable or cached for a different address */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (re)initialize the destination sockaddr */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10104 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up a scoped
 * route to *src_ip6 via this interface through the per-ifnet cache,
 * refreshing the cache on a miss.  Returns NULL when no route exists;
 * the caller owns a reference on the returned rtentry.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	/* cache miss: route unusable or cached for a different address */
	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* (re)initialize the destination sockaddr */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL after the release above; do a fresh lookup */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10141 
/*
 * Update the interface's link quality metric (LQM) state and, when it
 * changes, post a KEV_DL_LINK_QUALITY_METRIC_CHANGED event.  The raw
 * lqm value is first normalized to one of the threshold edges; an
 * ABORT-level quality additionally schedules the TCP inpcb timer so
 * connections on a dead link can be torn down quickly.
 *
 * Locking: 'locked' indicates whether the caller already holds the
 * ifnet lock exclusively.  The lock is always dropped before posting
 * the kernel event and reacquired on return only if the caller held
 * it — so the caller's lock state is preserved either way.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* link is effectively dead: nudge TCP to abort its inpcbs */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data));

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10206 
/*
 * Update the interface's RRC (radio resource control) state and post a
 * KEV_DL_RRC_STATE_CHANGED event if the value changed.
 *
 * Must be called with the ifnet lock held exclusively; the lock is
 * temporarily dropped while the kernel event is posted and reacquired
 * before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* No change and already valid: nothing to do */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state));

	/* Restore the exclusive lock for the caller */
	ifnet_lock_exclusive(ifp);
}
10236 
/*
 * Apply a caller-supplied interface state update (LQM, RRC state and/or
 * availability, as selected by valid_bitmask).
 *
 * Returns ENOTSUP if an RRC update is requested on a non-cellular
 * interface, EINVAL for out-of-range LQM or RRC values, 0 on success.
 * When the interface transitions to "available", TCP connections on it
 * are prodded to send probes immediately rather than waiting for timers.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense on cellular interfaces */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/*
	 * NOTE: both helpers below may temporarily drop and reacquire the
	 * ifnet lock while posting kernel events.
	 */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			/* remember the index so we can probe after unlocking */
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10307 
10308 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10309 if_get_state(struct ifnet *ifp,
10310     struct if_interface_state *if_interface_state)
10311 {
10312 	ifnet_lock_shared(ifp);
10313 
10314 	if_interface_state->valid_bitmask = 0;
10315 
10316 	if (ifp->if_interface_state.valid_bitmask &
10317 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10318 		if_interface_state->valid_bitmask |=
10319 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10320 		if_interface_state->rrc_state =
10321 		    ifp->if_interface_state.rrc_state;
10322 	}
10323 	if (ifp->if_interface_state.valid_bitmask &
10324 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10325 		if_interface_state->valid_bitmask |=
10326 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10327 		if_interface_state->lqm_state =
10328 		    ifp->if_interface_state.lqm_state;
10329 	}
10330 	if (ifp->if_interface_state.valid_bitmask &
10331 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10332 		if_interface_state->valid_bitmask |=
10333 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10334 		if_interface_state->interface_availability =
10335 		    ifp->if_interface_state.interface_availability;
10336 	}
10337 
10338 	ifnet_lock_done(ifp);
10339 }
10340 
10341 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10342 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10343 {
10344 	if (conn_probe > 1) {
10345 		return EINVAL;
10346 	}
10347 	if (conn_probe == 0) {
10348 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10349 	} else {
10350 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10351 	}
10352 
10353 #if NECP
10354 	necp_update_all_clients();
10355 #endif /* NECP */
10356 
10357 	tcp_probe_connectivity(ifp, conn_probe);
10358 	return 0;
10359 }
10360 
10361 /* for uuid.c */
10362 static int
get_ether_index(int * ret_other_index)10363 get_ether_index(int * ret_other_index)
10364 {
10365 	struct ifnet *ifp;
10366 	int en0_index = 0;
10367 	int other_en_index = 0;
10368 	int any_ether_index = 0;
10369 	short best_unit = 0;
10370 
10371 	*ret_other_index = 0;
10372 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10373 		/*
10374 		 * find en0, or if not en0, the lowest unit en*, and if not
10375 		 * that, any ethernet
10376 		 */
10377 		ifnet_lock_shared(ifp);
10378 		if (strcmp(ifp->if_name, "en") == 0) {
10379 			if (ifp->if_unit == 0) {
10380 				/* found en0, we're done */
10381 				en0_index = ifp->if_index;
10382 				ifnet_lock_done(ifp);
10383 				break;
10384 			}
10385 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
10386 				other_en_index = ifp->if_index;
10387 				best_unit = ifp->if_unit;
10388 			}
10389 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10390 			any_ether_index = ifp->if_index;
10391 		}
10392 		ifnet_lock_done(ifp);
10393 	}
10394 	if (en0_index == 0) {
10395 		if (other_en_index != 0) {
10396 			*ret_other_index = other_en_index;
10397 		} else if (any_ether_index != 0) {
10398 			*ret_other_index = any_ether_index;
10399 		}
10400 	}
10401 	return en0_index;
10402 }
10403 
/*
 * Fill 'node' with a 6-byte Ethernet MAC address suitable for UUID
 * generation.  Prefers en0; falls back to another "en" interface or any
 * Ethernet interface.  The en0 index is cached in a static and
 * re-resolved if the cached interface went away.
 *
 * Returns 0 on success, -1 if no suitable interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	/* cached en0 index; persists across calls, protected by head lock */
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* Revalidate the cache: the interface may have detached */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10445 
10446 static int
10447 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10448 {
10449 #pragma unused(arg1, arg2)
10450 	uint32_t i;
10451 	int err;
10452 
10453 	i = if_rxpoll;
10454 
10455 	err = sysctl_handle_int(oidp, &i, 0, req);
10456 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10457 		return err;
10458 	}
10459 
10460 	if (net_rxpoll == 0) {
10461 		return ENXIO;
10462 	}
10463 
10464 	if_rxpoll = i;
10465 	return err;
10466 }
10467 
10468 static int
10469 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10470 {
10471 #pragma unused(arg1, arg2)
10472 	uint64_t q;
10473 	int err;
10474 
10475 	q = if_rxpoll_mode_holdtime;
10476 
10477 	err = sysctl_handle_quad(oidp, &q, 0, req);
10478 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10479 		return err;
10480 	}
10481 
10482 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10483 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10484 	}
10485 
10486 	if_rxpoll_mode_holdtime = q;
10487 
10488 	return err;
10489 }
10490 
10491 static int
10492 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10493 {
10494 #pragma unused(arg1, arg2)
10495 	uint64_t q;
10496 	int err;
10497 
10498 	q = if_rxpoll_sample_holdtime;
10499 
10500 	err = sysctl_handle_quad(oidp, &q, 0, req);
10501 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10502 		return err;
10503 	}
10504 
10505 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10506 		q = IF_RXPOLL_SAMPLETIME_MIN;
10507 	}
10508 
10509 	if_rxpoll_sample_holdtime = q;
10510 
10511 	return err;
10512 }
10513 
10514 static int
10515 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10516 {
10517 #pragma unused(arg1, arg2)
10518 	uint64_t q;
10519 	int err;
10520 
10521 	q = if_rxpoll_interval_time;
10522 
10523 	err = sysctl_handle_quad(oidp, &q, 0, req);
10524 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10525 		return err;
10526 	}
10527 
10528 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10529 		q = IF_RXPOLL_INTERVALTIME_MIN;
10530 	}
10531 
10532 	if_rxpoll_interval_time = q;
10533 
10534 	return err;
10535 }
10536 
10537 static int
10538 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10539 {
10540 #pragma unused(arg1, arg2)
10541 	uint32_t i;
10542 	int err;
10543 
10544 	i = if_sysctl_rxpoll_wlowat;
10545 
10546 	err = sysctl_handle_int(oidp, &i, 0, req);
10547 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10548 		return err;
10549 	}
10550 
10551 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10552 		return EINVAL;
10553 	}
10554 
10555 	if_sysctl_rxpoll_wlowat = i;
10556 	return err;
10557 }
10558 
10559 static int
10560 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10561 {
10562 #pragma unused(arg1, arg2)
10563 	uint32_t i;
10564 	int err;
10565 
10566 	i = if_sysctl_rxpoll_whiwat;
10567 
10568 	err = sysctl_handle_int(oidp, &i, 0, req);
10569 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10570 		return err;
10571 	}
10572 
10573 	if (i <= if_sysctl_rxpoll_wlowat) {
10574 		return EINVAL;
10575 	}
10576 
10577 	if_sysctl_rxpoll_whiwat = i;
10578 	return err;
10579 }
10580 
10581 static int
10582 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10583 {
10584 #pragma unused(arg1, arg2)
10585 	int i, err;
10586 
10587 	i = if_sndq_maxlen;
10588 
10589 	err = sysctl_handle_int(oidp, &i, 0, req);
10590 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10591 		return err;
10592 	}
10593 
10594 	if (i < IF_SNDQ_MINLEN) {
10595 		i = IF_SNDQ_MINLEN;
10596 	}
10597 
10598 	if_sndq_maxlen = i;
10599 	return err;
10600 }
10601 
10602 static int
10603 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10604 {
10605 #pragma unused(arg1, arg2)
10606 	int i, err;
10607 
10608 	i = if_rcvq_maxlen;
10609 
10610 	err = sysctl_handle_int(oidp, &i, 0, req);
10611 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10612 		return err;
10613 	}
10614 
10615 	if (i < IF_RCVQ_MINLEN) {
10616 		i = IF_RCVQ_MINLEN;
10617 	}
10618 
10619 	if_rcvq_maxlen = i;
10620 	return err;
10621 }
10622 
10623 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10624 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10625     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10626 {
10627 	struct kev_dl_node_presence kev;
10628 	struct sockaddr_dl *sdl;
10629 	struct sockaddr_in6 *sin6;
10630 	int ret = 0;
10631 
10632 	VERIFY(ifp);
10633 	VERIFY(sa);
10634 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10635 
10636 	bzero(&kev, sizeof(kev));
10637 	sin6 = &kev.sin6_node_address;
10638 	sdl = &kev.sdl_node_address;
10639 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10640 	kev.rssi = rssi;
10641 	kev.link_quality_metric = lqm;
10642 	kev.node_proximity_metric = npm;
10643 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10644 
10645 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10646 	if (ret == 0) {
10647 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10648 		    &kev.link_data, sizeof(kev));
10649 		if (err != 0) {
10650 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10651 			    "error %d\n", __func__, err);
10652 		}
10653 	}
10654 	return ret;
10655 }
10656 
10657 void
dlil_node_absent(struct ifnet * ifp,struct sockaddr * sa)10658 dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
10659 {
10660 	struct kev_dl_node_absence kev = {};
10661 	struct sockaddr_in6 *kev_sin6 = NULL;
10662 	struct sockaddr_dl *kev_sdl = NULL;
10663 
10664 	VERIFY(ifp != NULL);
10665 	VERIFY(sa != NULL);
10666 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10667 
10668 	kev_sin6 = &kev.sin6_node_address;
10669 	kev_sdl = &kev.sdl_node_address;
10670 
10671 	if (sa->sa_family == AF_INET6) {
10672 		/*
10673 		 * If IPv6 address is given, get the link layer
10674 		 * address from what was cached in the neighbor cache
10675 		 */
10676 		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
10677 		bcopy(sa, kev_sin6, sa->sa_len);
10678 		nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
10679 	} else {
10680 		/*
10681 		 * If passed address is AF_LINK type, derive the address
10682 		 * based on the link address.
10683 		 */
10684 		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
10685 		nd6_alt_node_absent(ifp, kev_sin6, NULL);
10686 	}
10687 
10688 	kev_sdl->sdl_type = ifp->if_type;
10689 	kev_sdl->sdl_index = ifp->if_index;
10690 
10691 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
10692 	    &kev.link_data, sizeof(kev));
10693 }
10694 
10695 int
dlil_node_present_v2(struct ifnet * ifp,struct sockaddr * sa,struct sockaddr_dl * sdl,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10696 dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
10697     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10698 {
10699 	struct kev_dl_node_presence kev = {};
10700 	struct sockaddr_dl *kev_sdl = NULL;
10701 	struct sockaddr_in6 *kev_sin6 = NULL;
10702 	int ret = 0;
10703 
10704 	VERIFY(ifp != NULL);
10705 	VERIFY(sa != NULL && sdl != NULL);
10706 	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
10707 
10708 	kev_sin6 = &kev.sin6_node_address;
10709 	kev_sdl = &kev.sdl_node_address;
10710 
10711 	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
10712 	bcopy(sdl, kev_sdl, sdl->sdl_len);
10713 	kev_sdl->sdl_type = ifp->if_type;
10714 	kev_sdl->sdl_index = ifp->if_index;
10715 
10716 	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
10717 	bcopy(sa, kev_sin6, sa->sa_len);
10718 
10719 	kev.rssi = rssi;
10720 	kev.link_quality_metric = lqm;
10721 	kev.node_proximity_metric = npm;
10722 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10723 
10724 	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
10725 	if (ret == 0) {
10726 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10727 		    &kev.link_data, sizeof(kev));
10728 		if (err != 0) {
10729 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
10730 		}
10731 	}
10732 	return ret;
10733 }
10734 
10735 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10736 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10737     kauth_cred_t *credp)
10738 {
10739 	const u_int8_t *bytes;
10740 	size_t size;
10741 
10742 	bytes = CONST_LLADDR(sdl);
10743 	size = sdl->sdl_alen;
10744 
10745 #if CONFIG_MACF
10746 	if (dlil_lladdr_ckreq) {
10747 		switch (sdl->sdl_type) {
10748 		case IFT_ETHER:
10749 		case IFT_IEEE1394:
10750 			break;
10751 		default:
10752 			credp = NULL;
10753 			break;
10754 		}
10755 		;
10756 
10757 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10758 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10759 				[0] = 2
10760 			};
10761 
10762 			bytes = unspec;
10763 		}
10764 	}
10765 #else
10766 #pragma unused(credp)
10767 #endif
10768 
10769 	if (sizep != NULL) {
10770 		*sizep = size;
10771 	}
10772 	return bytes;
10773 }
10774 
10775 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])10776 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
10777     u_int8_t info[DLIL_MODARGLEN])
10778 {
10779 	struct kev_dl_issues kev;
10780 	struct timeval tv;
10781 
10782 	VERIFY(ifp != NULL);
10783 	VERIFY(modid != NULL);
10784 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
10785 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
10786 
10787 	bzero(&kev, sizeof(kev));
10788 
10789 	microtime(&tv);
10790 	kev.timestamp = tv.tv_sec;
10791 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
10792 	if (info != NULL) {
10793 		bcopy(info, &kev.info, DLIL_MODARGLEN);
10794 	}
10795 
10796 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
10797 	    &kev.link_data, sizeof(kev));
10798 }
10799 
/*
 * Handle SIOCSIFOPPORTUNISTIC (set) and SIOCGIFOPPORTUNISTIC (get).
 *
 * Set: requires superuser; maps ifo_flags to a throttle level and applies
 * it via ifnet_set_throttle().  Get: reports the current throttle level
 * in ifo_flags.  In both cases, on success ifo_inuse is filled with the
 * count of opportunistic TCP+UDP connections on the interface.
 * EALREADY from the throttle layer ("no change") is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Only the two known flag patterns are accepted */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* "Already at that level" is not an error to the caller */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10858 
10859 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)10860 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
10861 {
10862 	struct ifclassq *ifq;
10863 	int err = 0;
10864 
10865 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
10866 		return ENXIO;
10867 	}
10868 
10869 	*level = IFNET_THROTTLE_OFF;
10870 
10871 	ifq = ifp->if_snd;
10872 	IFCQ_LOCK(ifq);
10873 	/* Throttling works only for IFCQ, not ALTQ instances */
10874 	if (IFCQ_IS_ENABLED(ifq)) {
10875 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
10876 
10877 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10878 		*level = req.level;
10879 	}
10880 	IFCQ_UNLOCK(ifq);
10881 
10882 	return err;
10883 }
10884 
10885 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)10886 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
10887 {
10888 	struct ifclassq *ifq;
10889 	int err = 0;
10890 
10891 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
10892 		return ENXIO;
10893 	}
10894 
10895 	ifq = ifp->if_snd;
10896 
10897 	switch (level) {
10898 	case IFNET_THROTTLE_OFF:
10899 	case IFNET_THROTTLE_OPPORTUNISTIC:
10900 		break;
10901 	default:
10902 		return EINVAL;
10903 	}
10904 
10905 	IFCQ_LOCK(ifq);
10906 	if (IFCQ_IS_ENABLED(ifq)) {
10907 		cqrq_throttle_t req = { 1, level };
10908 
10909 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10910 	}
10911 	IFCQ_UNLOCK(ifq);
10912 
10913 	if (err == 0) {
10914 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
10915 		    level);
10916 #if NECP
10917 		necp_update_all_clients();
10918 #endif /* NECP */
10919 		if (level == IFNET_THROTTLE_OFF) {
10920 			ifnet_start(ifp);
10921 		}
10922 	}
10923 
10924 	return err;
10925 }
10926 
/*
 * Handle SIOCSIFLOG (set) and SIOCGIFLOG (get) for per-interface logging.
 *
 * Set: requires PRIV_NET_INTERFACE_CONTROL; validates level and flags
 * (note: validation deliberately falls through so both checks run and
 * 'flags' is always masked), then applies via ifnet_set_log().
 * Get: fills the ifr_log fields from ifnet_get_log().
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* masked in place; zero after masking means no facility */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
10974 
/*
 * Set the interface's logging level and facility flags, propagating the
 * request to the driver via its output-control callback when one is
 * registered.  The level applies to all facilities; flags accumulate
 * across calls until level is reset to IFNET_LOG_DEFAULT.
 *
 * Returns 0 on success or the error from the driver's if_output_ctl.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* the DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* resetting to the default level clears all facilities */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11033 
11034 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11035 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11036     int32_t *category, int32_t *subcategory)
11037 {
11038 	if (level != NULL) {
11039 		*level = ifp->if_log.level;
11040 	}
11041 	if (flags != NULL) {
11042 		*flags = ifp->if_log.flags;
11043 	}
11044 	if (category != NULL) {
11045 		*category = ifp->if_log.category;
11046 	}
11047 	if (subcategory != NULL) {
11048 		*subcategory = ifp->if_log.subcategory;
11049 	}
11050 
11051 	return 0;
11052 }
11053 
11054 int
ifnet_notify_address(struct ifnet * ifp,int af)11055 ifnet_notify_address(struct ifnet *ifp, int af)
11056 {
11057 	struct ifnet_notify_address_params na;
11058 
11059 #if PF
11060 	(void) pf_ifaddr_hook(ifp);
11061 #endif /* PF */
11062 
11063 	if (ifp->if_output_ctl == NULL) {
11064 		return EOPNOTSUPP;
11065 	}
11066 
11067 	bzero(&na, sizeof(na));
11068 	na.address_family = (sa_family_t)af;
11069 
11070 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11071 	           sizeof(na), &na);
11072 }
11073 
11074 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11075 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11076 {
11077 	if (ifp == NULL || flowid == NULL) {
11078 		return EINVAL;
11079 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11080 	    !IF_FULLY_ATTACHED(ifp)) {
11081 		return ENXIO;
11082 	}
11083 
11084 	*flowid = ifp->if_flowhash;
11085 
11086 	return 0;
11087 }
11088 
11089 errno_t
ifnet_disable_output(struct ifnet * ifp)11090 ifnet_disable_output(struct ifnet *ifp)
11091 {
11092 	int err;
11093 
11094 	if (ifp == NULL) {
11095 		return EINVAL;
11096 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11097 	    !IF_FULLY_ATTACHED(ifp)) {
11098 		return ENXIO;
11099 	}
11100 
11101 	if ((err = ifnet_fc_add(ifp)) == 0) {
11102 		lck_mtx_lock_spin(&ifp->if_start_lock);
11103 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11104 		lck_mtx_unlock(&ifp->if_start_lock);
11105 	}
11106 	return err;
11107 }
11108 
11109 errno_t
ifnet_enable_output(struct ifnet * ifp)11110 ifnet_enable_output(struct ifnet *ifp)
11111 {
11112 	if (ifp == NULL) {
11113 		return EINVAL;
11114 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11115 	    !IF_FULLY_ATTACHED(ifp)) {
11116 		return ENXIO;
11117 	}
11118 
11119 	ifnet_start_common(ifp, TRUE);
11120 	return 0;
11121 }
11122 
/*
 * Flow advisory callback: a driver signals that the queue identified by
 * 'flowhash' has drained.  Looks up (and removes) the matching
 * flow-control entry and re-enables output on its interface.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* removes the entry from the tree; we own it (and its ifp ref) now */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the io ref taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11146 
11147 /*
11148  * Function to compare ifnet_fc_entries in ifnet flow control tree
11149  */
11150 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11151 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11152 {
11153 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11154 }
11155 
/*
 * Register the interface in the flow-control tree, keyed by its flow
 * hash.  Returns 0 if the entry exists or was added; EAGAIN if a
 * different interface already occupies the same hash (rare collision —
 * we simply decline rather than chain).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* build a stack key for the RB lookup */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; zalloc may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11199 
/*
 * Look up and REMOVE the flow-control entry for 'flowhash'.  Returns the
 * detached entry (caller must free via ifnet_fc_entry_free()) or NULL if
 * no entry exists or its interface is no longer attached (in which case
 * the entry is freed here).
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* build a stack key for the RB lookup */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* detach the entry before validating its interface */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11237 
/* Return a flow-control entry to its backing zone */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11243 
11244 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11245 ifnet_calc_flowhash(struct ifnet *ifp)
11246 {
11247 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11248 	uint32_t flowhash = 0;
11249 
11250 	if (ifnet_flowhash_seed == 0) {
11251 		ifnet_flowhash_seed = RandomULong();
11252 	}
11253 
11254 	bzero(&fh, sizeof(fh));
11255 
11256 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11257 	fh.ifk_unit = ifp->if_unit;
11258 	fh.ifk_flags = ifp->if_flags;
11259 	fh.ifk_eflags = ifp->if_eflags;
11260 	fh.ifk_capabilities = ifp->if_capabilities;
11261 	fh.ifk_capenable = ifp->if_capenable;
11262 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11263 	fh.ifk_rand1 = RandomULong();
11264 	fh.ifk_rand2 = RandomULong();
11265 
11266 try_again:
11267 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11268 	if (flowhash == 0) {
11269 		/* try to get a non-zero flowhash */
11270 		ifnet_flowhash_seed = RandomULong();
11271 		goto try_again;
11272 	}
11273 
11274 	return flowhash;
11275 }
11276 
/*
 * Install (or clear, when len == 0) the network signature for the given
 * address family on the interface.  The signature is stored in the
 * per-family extra data (IN_IFEXTRA / IN6_IFEXTRA).
 *
 * Returns 0 on success, EINVAL for an unsupported family or oversized
 * signature, ENOMEM if the per-family extra data is absent.
 *
 * NOTE: in the len == 0 and oversized-len arms, the lock is dropped and
 * the `break` exits the enclosing switch, deliberately skipping the
 * trailing unlock after the if/else — there is no double unlock.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11338 
/*
 * Copy out the stored network signature for the given address family.
 * On entry *len is the caller's buffer capacity; on success it is
 * updated to the actual signature length.  `flags` is optional and is
 * always set to 0 on success.
 *
 * Returns 0 on success, EINVAL for bad arguments / too-small buffer /
 * unsupported family, ENOENT when no signature is set, ENOMEM when the
 * per-family extra data is absent.
 *
 * NOTE: the early `break`s inside each case unlock first and then exit
 * the switch, skipping the trailing unlock — no double unlock.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11399 
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the interface.
 * A slot with prefix_len == 0 clears the corresponding stored prefix.
 * Valid prefix lengths are the RFC 6052 set (32/40/48/56/64/96 bits);
 * prefixes with embedded interface/link-local scope are rejected.
 *
 * Returns 0 on success (notifying NECP clients if at least one prefix
 * was set), EINVAL on a bad prefix, ENOMEM if IN6_IFEXTRA is absent.
 * On an EINVAL partway through, earlier slots have already been updated.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Notify NECP only after the lock is dropped */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11465 
11466 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11467 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11468 {
11469 	int i, found_one = 0, error = 0;
11470 
11471 	if (ifp == NULL) {
11472 		return EINVAL;
11473 	}
11474 
11475 	if_inet6data_lock_shared(ifp);
11476 
11477 	if (IN6_IFEXTRA(ifp) == NULL) {
11478 		error = ENOMEM;
11479 		goto out;
11480 	}
11481 
11482 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11483 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11484 			found_one = 1;
11485 		}
11486 	}
11487 
11488 	if (found_one == 0) {
11489 		error = ENOENT;
11490 		goto out;
11491 	}
11492 
11493 	if (prefixes) {
11494 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11495 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11496 	}
11497 
11498 out:
11499 	if_inet6data_lock_done(ifp);
11500 
11501 	return error;
11502 }
11503 
/*
 * Hardware-checksum debug hook on the output path: when the
 * HWCKSUM_DBG_FINALIZE_FORCED mode is enabled, force software
 * finalization of the IP/transport checksums and count how many headers
 * and payloads were finalized.  TSO packets are skipped, since their
 * checksums are produced by the hardware segmentation engine.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* non-IP families have no checksum to finalize */
		return;
	}
}
11545 
/*
 * Hardware-checksum debug hook on the input path.  Depending on
 * hwcksum_dbg_mode it can (a) force partial checksum offload by
 * computing the 16-bit 1's complement sum in software from a configured
 * offset, and/or (b) verify a driver-supplied partial checksum against
 * a software recomputation, and/or (c) re-adjust the sum to emulate
 * hardware that starts summing at a different offset.  Updates the
 * hwcksum_dbg_* counters accordingly.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* frame_header must point inside this mbuf, at or before m_data */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* only IP traffic participates in checksum debugging */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* fold the sum to the new start offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11670 
11671 static int
11672 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11673 {
11674 #pragma unused(arg1, arg2)
11675 	u_int32_t i;
11676 	int err;
11677 
11678 	i = hwcksum_dbg_mode;
11679 
11680 	err = sysctl_handle_int(oidp, &i, 0, req);
11681 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11682 		return err;
11683 	}
11684 
11685 	if (hwcksum_dbg == 0) {
11686 		return ENODEV;
11687 	}
11688 
11689 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
11690 		return EINVAL;
11691 	}
11692 
11693 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
11694 
11695 	return err;
11696 }
11697 
11698 static int
11699 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
11700 {
11701 #pragma unused(arg1, arg2)
11702 	u_int32_t i;
11703 	int err;
11704 
11705 	i = hwcksum_dbg_partial_rxoff_forced;
11706 
11707 	err = sysctl_handle_int(oidp, &i, 0, req);
11708 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11709 		return err;
11710 	}
11711 
11712 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
11713 		return ENODEV;
11714 	}
11715 
11716 	hwcksum_dbg_partial_rxoff_forced = i;
11717 
11718 	return err;
11719 }
11720 
11721 static int
11722 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
11723 {
11724 #pragma unused(arg1, arg2)
11725 	u_int32_t i;
11726 	int err;
11727 
11728 	i = hwcksum_dbg_partial_rxoff_adj;
11729 
11730 	err = sysctl_handle_int(oidp, &i, 0, req);
11731 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11732 		return err;
11733 	}
11734 
11735 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
11736 		return ENODEV;
11737 	}
11738 
11739 	hwcksum_dbg_partial_rxoff_adj = i;
11740 
11741 	return err;
11742 }
11743 
11744 static int
11745 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
11746 {
11747 #pragma unused(oidp, arg1, arg2)
11748 	int err;
11749 
11750 	if (req->oldptr == USER_ADDR_NULL) {
11751 	}
11752 	if (req->newptr != USER_ADDR_NULL) {
11753 		return EPERM;
11754 	}
11755 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
11756 	    sizeof(struct chain_len_stats));
11757 
11758 	return err;
11759 }
11760 
11761 
11762 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification: 325 bytes of fixed (gzip-compressed)
 * data used as input to the m_sum16/b_sum16 self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
11799 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* sumr has been computed for this entry */
	uint16_t        len;    /* span length (bytes) into sumdata */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11824 
/*
 * Boot-time self-test for the checksum primitives: for every length in
 * sumtbl and every byte alignment within an 8-byte window, verify that
 * m_sum16() (zero-offset and offset forms) and b_sum16() agree with the
 * reference in_cksum_mbuf_ref() and with the precomputed sums.  Any
 * mismatch is fatal (panic) since all checksumming depends on these.
 * Compiled only under DEBUG || DEVELOPMENT.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Compute the reference sum once per length, then cache it */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
11913 #endif /* DEBUG || DEVELOPMENT */
11914 
/* Expand to a case label returning the symbol's name as a string */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel-event code to its symbolic name for logging.
 * Returns the empty string for codes not listed here.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
11951 
/*
 * Thread-call callback for the per-interface data-threshold timer
 * (scheduled by ifnet_notify_data_threshold below).  Takes an I/O
 * reference on the ifnet (ifnet_is_attached with refio == 1) before
 * notifying the network-statistics layer, and drops it afterwards;
 * does nothing if the interface is no longer attached.
 */
static void
dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct ifnet *ifp = arg0;

	if (ifnet_is_attached(ifp, 1)) {
		nstat_ifnet_threshold_reached(ifp->if_index);
		ifnet_decr_iorefcnt(ifp);
	}
}
11963 
/*
 * Check whether the interface's combined in+out byte count has advanced
 * past its data threshold since the last notification, and if so
 * schedule the dlil_dt_tcall_fn thread call to tell NetworkStatistics.
 * The OSCompareAndSwap64 ensures only one racing caller claims the new
 * byte count, and thread_call_isactive() plus the delayed deadline
 * rate-limit notifications to one per threshold_interval.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* defer the callback to the next periodic deadline */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* no interval configured; fire immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
11993 
11994 #if (DEVELOPMENT || DEBUG)
11995 /*
11996  * The sysctl variable name contains the input parameters of
11997  * ifnet_get_keepalive_offload_frames()
11998  *  ifp (interface index): name[0]
11999  *  frames_array_count:    name[1]
12000  *  frame_data_offset:     name[2]
12001  * The return length gives used_frames_count
12002  */
/*
 * sysctl handler (DEVELOPMENT/DEBUG only) that returns the keep-alive
 * offload frames for an interface.  Input parameters arrive in the OID
 * name vector: name[0] = ifindex, name[1] = frames_array_count,
 * name[2] = frame_data_offset (see the block comment above).  Root only.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/*
	 * Make sure the passed buffer is large enough.
	 * NOTE(review): the product promotes to size_t; on LP64 this
	 * cannot overflow for a 32-bit count, but on a 32-bit kernel a
	 * huge user-supplied count could wrap — verify if that matters.
	 */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* Resolve the ifindex under the head lock; no I/O ref is taken */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out one frame at a time; the used count is the out length */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12094 #endif /* DEVELOPMENT || DEBUG */
12095 
/* Forward per-flow interface statistics to the TCP accounting layer */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12102 
/* Atomically OR set_flags into *flags_p; returns the previous value */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12108 
/* Atomically clear clear_flags in *flags_p */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12114 
/* Atomically set extended-flag bits; returns the previous if_eflags */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12120 
/* Atomically clear extended-flag bits on the interface */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12126 
/* Atomically set x-flag bits; returns the previous if_xflags */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12132 
/* Atomically clear x-flag bits on the interface */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12138 
12139 static void
log_hexdump(void * data,size_t len)12140 log_hexdump(void *data, size_t len)
12141 {
12142 	size_t i, j, k;
12143 	unsigned char *ptr = (unsigned char *)data;
12144 #define MAX_DUMP_BUF 32
12145 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12146 
12147 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12148 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12149 			unsigned char msnbl = ptr[j] >> 4;
12150 			unsigned char lsnbl = ptr[j] & 0x0f;
12151 
12152 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12153 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12154 
12155 			if ((j % 2) == 1) {
12156 				buf[k++] = ' ';
12157 			}
12158 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12159 				buf[k++] = ' ';
12160 			}
12161 		}
12162 		buf[k] = 0;
12163 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12164 	}
12165 }
12166 
12167 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
12168 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12169 net_check_compatible_if_filter(struct ifnet *ifp)
12170 {
12171 	if (ifp == NULL) {
12172 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12173 			return false;
12174 		}
12175 	} else {
12176 		if (ifp->if_flt_non_os_count > 0) {
12177 			return false;
12178 		}
12179 	}
12180 	return true;
12181 }
12182 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12183