xref: /xnu-8020.140.41/bsd/net/dlil.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 1999-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include <stddef.h>
35 #include <ptrauth.h>
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62 
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69 
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
/* kdebug trace codes for DLIL static probes (input/output/if-output paths) */
#define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
#define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
#define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
#define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
#define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))

#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
#define MAX_LINKADDR        4 /* LONGWORDS */

/* DLIL diagnostics go to printf(); flip the conditional to use kprintf() */
#if 1
#define DLIL_PRINTF     printf
#else
#define DLIL_PRINTF     kprintf
#endif

/*
 * Compile-time assertions that 64-bit statistics fields are 64-bit
 * aligned within if_data_internal / the embedded ifnet if_data.
 */
#define IF_DATA_REQUIRE_ALIGNED_64(f)   \
	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))

#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
166 
/*
 * Protocol KPI versions a protocol may attach with; selects which member
 * of the if_proto kpi union (v1 or v2) holds the registered callbacks.
 */
enum {
	kProtoKPI_v1    = 1,
	kProtoKPI_v2    = 2
};
171 
/*
 * List of if_proto structures in if_proto_hash[] is protected by
 * the ifnet lock.  The rest of the fields are initialized at protocol
 * attach time and never change, thus no lock required as long as
 * a reference to it is valid, via if_proto_ref().
 */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;      /* linkage in if_proto_hash[] chain */
	u_int32_t                   refcount;       /* held via if_proto_ref()/if_proto_free() */
	u_int32_t                   detached;       /* non-zero once detached from ifp */
	struct ifnet                *ifp;           /* interface the protocol is attached to */
	protocol_family_t           protocol_family;
	int                         proto_kpi;      /* kProtoKPI_v1 or kProtoKPI_v2 */
	union {
		/* v1 KPI: input callback also receives a frame header pointer */
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		/* v2 KPI: input callback takes the mbuf only (no frame header arg) */
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
206 
/* Head type for one bucket of an interface's if_proto_hash[] table */
SLIST_HEAD(proto_hash_entry, if_proto);

/* Usable bytes of sdl_data[] within a DLIL_SDLMAXLEN-sized sockaddr_dl */
#define DLIL_SDLDATALEN \
	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
211 
/*
 * DLIL's private container for an interface.  The public ifnet is the
 * first member, so an ifnet pointer can be converted back to its
 * dlil_ifnet with IFP_TO_DLIL() (and the reverse with DLIL_TO_IFP()).
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct {
		struct ifaddr   ifa;            /* lladdr ifa */
		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* non-zero once the above is valid */
	u_int8_t dl_if_unused;                  /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
239 
/* Values for dl_if_flags (private to DLIL) */
#define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
#define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
#define DLIF_DEBUG      0x4     /* has debugging info */

#define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */

/* For gdb */
__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;

/*
 * Debug variant of dlil_ifnet: records who took/released ifnet
 * references (used when DLIF_DEBUG is set).
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};

/* Convert between a dlil_ifnet and its embedded public ifnet */
#define DLIL_TO_IFP(s)  (&s->dl_if)
#define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
263 
/*
 * State for one attached interface filter and its callbacks
 * (invoked from the interface-filter input/output/event/ioctl paths).
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* list linkage */
	u_int32_t                       filt_skip;      /* non-zero: skip this filter */
	u_int32_t                       filt_flags;
	ifnet_t                         filt_ifp;       /* interface filtered */
	const char                      *filt_name;
	void                            *filt_cookie;   /* opaque client state */
	protocol_family_t               filt_protocol;  /* 0 matches all protocols */
	iff_input_func                  filt_input;
	iff_output_func                 filt_output;
	iff_event_func                  filt_event;
	iff_ioctl_func                  filt_ioctl;
	iff_detached_func               filt_detached;
};
278 
struct proto_input_entry;

/* List of all allocated dlil_ifnets (protected by dlil_ifnet_lock) */
static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;

/* Lock groups/attributes for DLIL's internal and per-ifnet locks */
static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);

static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");

LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
    &dlil_lck_attributes);
static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
    &dlil_lck_attributes);

#if DEBUG
static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
#else
static unsigned int ifnet_debug;        /* debugging (disabled) */
#endif /* !DEBUG */
/* dlif_size/dlif_bufsize are computed at init; non-debug vs debug variant */
static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
static struct zone *dlif_zone;          /* zone for dlil_ifnet */
#define DLIF_ZONE_NAME          "ifnet"         /* zone name */

/* Fixed-size zones for filters, protocol hash tables and protocol entries */
static ZONE_DEFINE(dlif_filt_zone, "ifnet_filter",
    sizeof(struct ifnet_filter), ZC_ZFREE_CLEARMEM);

static ZONE_DEFINE(dlif_phash_zone, "ifnet_proto_hash",
    sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, ZC_ZFREE_CLEARMEM);

static ZONE_DEFINE(dlif_proto_zone, "ifnet_proto",
    sizeof(struct if_proto), ZC_ZFREE_CLEARMEM);

static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
#define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */

static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
#define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */

static u_int32_t net_rtref;             /* non-zero: hold route refs on input */

/* The main (shared) DLIL input thread, also exposed as a threading_info */
static struct dlil_main_threading_info dlil_main_input_thread_info;
__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
    (struct dlil_threading_info *)&dlil_main_input_thread_info;
331 
/* --- Forward declarations: internal event/filter/protocol helpers --- */
static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
static void dlil_if_trace(struct dlil_ifnet *, int);
static void if_proto_ref(struct if_proto *);
static void if_proto_free(struct if_proto *);
static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
    u_int32_t list_count);
static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
static void if_flt_monitor_busy(struct ifnet *);
static void if_flt_monitor_unbusy(struct ifnet *);
static void if_flt_monitor_enter(struct ifnet *);
static void if_flt_monitor_leave(struct ifnet *);
static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
    char **, protocol_family_t);
static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
    protocol_family_t);
static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
    const struct sockaddr_dl *);
static int ifnet_lookup(struct ifnet *);
static void if_purgeaddrs(struct ifnet *);

/* --- Shims bridging the v1/v2 protocol KPI callbacks (if_proto.kpi) --- */
static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
    struct mbuf *, char *);
static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
    struct mbuf *);
static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
    mbuf_t *, const struct sockaddr *, void *, char *, char *);
static void ifproto_media_event(struct ifnet *, protocol_family_t,
    const struct kev_msg *);
static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
    unsigned long, void *);
static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
    struct sockaddr_dl *, size_t);
static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
    const struct sockaddr_dl *, const struct sockaddr *,
    const struct sockaddr_dl *, const struct sockaddr *);

/* --- Default ifnet method implementations (used when driver omits them) --- */
static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp);
static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
    protocol_family_t *);
static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
    const struct ifnet_demux_desc *, u_int32_t);
static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
#if !XNU_TARGET_OS_OSX
static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *,
    u_int32_t *, u_int32_t *);
#else /* XNU_TARGET_OS_OSX */
static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *);
#endif /* XNU_TARGET_OS_OSX */
static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *,
    u_int32_t *, u_int32_t *);
static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
static void ifp_if_free(struct ifnet *);
static void ifp_if_event(struct ifnet *, const struct kev_msg *);
static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);

/* --- Input-path dispatch and input-thread lifecycle --- */
static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
    struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
    boolean_t, struct thread *);
static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
    struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
    boolean_t, struct thread *);

static void dlil_main_input_thread_func(void *, wait_result_t);
static void dlil_main_input_thread_cont(void *, wait_result_t);

static void dlil_input_thread_func(void *, wait_result_t);
static void dlil_input_thread_cont(void *, wait_result_t);

static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);

static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
    thread_continue_t *);
static void dlil_terminate_input_thread(struct dlil_threading_info *);
static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
    struct dlil_threading_info *, struct ifnet *, boolean_t);
static boolean_t dlil_input_stats_sync(struct ifnet *,
    struct dlil_threading_info *);
static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
    u_int32_t, ifnet_model_t, boolean_t);
static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
/* CLAT46/64 IPv4<->IPv6 translation helpers */
static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
#if DEBUG || DEVELOPMENT
static void dlil_verify_sum16(void);
#endif /* DEBUG || DEVELOPMENT */
static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
    protocol_family_t);
static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
    protocol_family_t);

static void dlil_incr_pending_thread_count(void);
static void dlil_decr_pending_thread_count(void);

/* --- Deferred interface detach machinery --- */
static void ifnet_detacher_thread_func(void *, wait_result_t);
static void ifnet_detacher_thread_cont(void *, wait_result_t);
static void ifnet_detach_final(struct ifnet *);
static void ifnet_detaching_enqueue(struct ifnet *);
static struct ifnet *ifnet_detaching_dequeue(void);

/* --- Per-interface start (transmit) and poll threads --- */
static void ifnet_start_thread_func(void *, wait_result_t);
static void ifnet_start_thread_cont(void *, wait_result_t);

static void ifnet_poll_thread_func(void *, wait_result_t);
static void ifnet_poll_thread_cont(void *, wait_result_t);

static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
    classq_pkt_t *, boolean_t, boolean_t *);

/* --- Cached per-ifnet source route copy in/out --- */
static void ifp_src_route_copyout(struct ifnet *, struct route *);
static void ifp_src_route_copyin(struct ifnet *, struct route *);
static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);

/* --- sysctl handlers (definitions appear later in this file) --- */
static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;

struct chain_len_stats tx_chain_len_stats;
static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;

#if TEST_INPUT_THREAD_TERMINATION
static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
#endif /* TEST_INPUT_THREAD_TERMINATION */
478 
/* The following are protected by dlil_ifnet_lock */
static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
static u_int32_t ifnet_detaching_cnt;
static boolean_t ifnet_detaching_embryonic;
static void *ifnet_delayed_run; /* wait channel for detaching thread */

static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
    &dlil_lck_attributes);

/* Random seed mixed into ifnet_calc_flowhash() */
static uint32_t ifnet_flowhash_seed;

/*
 * Key material hashed by ifnet_calc_flowhash(); a snapshot of identifying
 * interface attributes plus two random words.
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];
	uint32_t        ifk_unit;
	uint32_t        ifk_flags;
	uint32_t        ifk_eflags;
	uint32_t        ifk_capabilities;
	uint32_t        ifk_capenable;
	uint32_t        ifk_output_sched_model;
	uint32_t        ifk_rand1;
	uint32_t        ifk_rand2;
};

/* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* linkage in ifnet_fc_tree */
	u_int32_t       ifce_flowhash;          /* tree key (see ifce_cmp) */
	struct ifnet    *ifce_ifp;
};

static uint32_t ifnet_calc_flowhash(struct ifnet *);
static int ifce_cmp(const struct ifnet_fc_entry *,
    const struct ifnet_fc_entry *);
static int ifnet_fc_add(struct ifnet *);
static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
static void ifnet_fc_entry_free(struct ifnet_fc_entry *);

/* protected by ifnet_fc_lock */
RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);

static ZONE_DEFINE(ifnet_fc_zone, "ifnet_fc_zone",
    sizeof(struct ifnet_fc_entry), ZC_ZFREE_CLEARMEM);
523 
/* External interfaces provided elsewhere in the kernel */
extern void bpfdetach(struct ifnet *);
extern void proto_input_run(void);

extern uint32_t udp_count_opportunistic(unsigned int ifindex,
    u_int32_t flags);
extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
    u_int32_t flags);

__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);

#if CONFIG_MACF
/* MACF link-layer address check: required on embedded, off on macOS */
#if !XNU_TARGET_OS_OSX
int dlil_lladdr_ckreq = 1;
#else /* XNU_TARGET_OS_OSX */
int dlil_lladdr_ckreq = 0;
#endif /* XNU_TARGET_OS_OSX */
#endif /* CONFIG_MACF */

/* Verbose DLIL logging; defaults on only for DEBUG kernels */
#if DEBUG
int dlil_verbose = 1;
#else
int dlil_verbose = 0;
#endif /* DEBUG */
#if IFNET_INPUT_SANITY_CHK
/* sanity checking of input packet lists received */
static u_int32_t dlil_input_sanity_check = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
/* rate limit debug messages */
struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
553 
SYSCTL_DECL(_net_link_generic_system);

SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");

/* Transmit/receive queue length tunables (clamped by their handlers) */
#define IF_SNDQ_MINLEN  32
u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
    sysctl_sndq_maxlen, "I", "Default transmit queue max length");

#define IF_RCVQ_MINLEN  32
#define IF_RCVQ_MAXLEN  256
u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
/*
 * NOTE(review): arg2 here is IFQ_MAXLEN rather than IF_RCVQ_MAXLEN —
 * likely a copy/paste from sndq_maxlen above; presumably harmless because
 * the sysctl_rcvq_maxlen handler reads if_rcvq_maxlen directly — confirm.
 */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
    sysctl_rcvq_maxlen, "I", "Default receive queue max length");

/* --- Opportunistic input polling (rxpoll) tunables --- */
#define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
    "ilog2 of EWMA decay rate of avg inbound packets");

#define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
#define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
    IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
    "Q", "input poll mode freeze time");

#define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
#define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
    IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
    "Q", "input poll sampling time");

static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
    IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
    "Q", "input poll interval (time)");

#define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
    IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");

#define IF_RXPOLL_WLOWAT        10
static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
    IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
    "I", "input poll wakeup low watermark");

#define IF_RXPOLL_WHIWAT        100
static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
    IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
    "I", "input poll wakeup high watermark");

static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
    "max packets per poll call");

u_int32_t if_rxpoll = 1;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
    sysctl_rxpoll, "I", "enable opportunistic input polling");

#if TEST_INPUT_THREAD_TERMINATION
static u_int32_t if_input_thread_termination_spin = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &if_input_thread_termination_spin, 0,
    sysctl_input_thread_termination_spin,
    "I", "input thread termination spin limit");
#endif /* TEST_INPUT_THREAD_TERMINATION */

static u_int32_t cur_dlil_input_threads = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
    "Current number of DLIL input threads");

#if IFNET_INPUT_SANITY_CHK
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
    "Turn on sanity checking in DLIL input");
#endif /* IFNET_INPUT_SANITY_CHK */

static u_int32_t if_flowadv = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
    "enable flow-advisory mechanism");

static u_int32_t if_delaybased_queue = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
    "enable delay based dynamic queue sizing");

static uint64_t hwcksum_in_invalidated = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");

uint32_t hwcksum_dbg = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
    "enable hardware cksum debugging");

u_int32_t ifnet_start_delayed = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
    "number of times start was delayed");
674 
675 u_int32_t ifnet_delay_start_disabled = 0;
676 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
677     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
678     "number of times start was delayed");
679 
/*
 * Atomically increment the count of times the delay-start mechanism
 * was disabled (exported via the start_delay_disabled sysctl above).
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
685 
/* Bits accepted by the hwcksum_dbg_mode sysctl below */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* --- Read-only counters maintained by the cksum debugging code --- */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* Global enables for TX/RX hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");
772 
773 static uint32_t threshold_interval = 2;         /* in seconds */
774 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
775     CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");
776 
777 #if (DEVELOPMENT || DEBUG)
778 static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
779 SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
780     CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
781 #endif /* DEVELOPMENT || DEBUG */
782 
783 struct net_api_stats net_api_stats;
784 SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
785     &net_api_stats, net_api_stats, "");
786 
787 uint32_t net_wake_pkt_debug = 0;
788 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
789     CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");
790 
791 static void log_hexdump(void *data, size_t len);
792 
793 unsigned int net_rxpoll = 1;
794 unsigned int net_affinity = 1;
795 unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */
796 
797 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
798 
799 extern u_int32_t        inject_buckets;
800 
801 /* DLIL data threshold thread call */
802 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
803 
804 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)805 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
806 {
807 	/*
808 	 * update filter count and route_generation ID to let TCP
809 	 * know it should reevalute doing TSO or not
810 	 */
811 	if (filter_enable) {
812 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
813 	} else {
814 		VERIFY(ifp->if_flt_no_tso_count != 0);
815 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
816 	}
817 	routegenid_update();
818 }
819 
820 #if SKYWALK
821 
822 #if defined(XNU_TARGET_OS_OSX)
823 static bool net_check_compatible_if_filter(struct ifnet *ifp);
824 #endif /* XNU_TARGET_OS_OSX */
825 
826 /* if_attach_nx flags defined in os_skywalk_private.h */
/* Global policy bitmask controlling which nexus types auto-attach. */
827 static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* Derived boolean views of the default policy, individually togglable. */
828 unsigned int if_enable_fsw_ip_netagent =
829     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
830 unsigned int if_enable_fsw_transport_netagent =
831     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
832 
833 unsigned int if_netif_all =
834     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
835 
836 /* Configure flowswitch to use max mtu sized buffer */
837 static bool fsw_use_max_mtu_buffer = false;
838 
839 #if (DEVELOPMENT || DEBUG)
840 static int
841 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
842 {
843 #pragma unused(oidp, arg1, arg2)
844 	unsigned int new_value;
845 	int changed;
846 	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
847 	    &new_value, &changed);
848 	if (error) {
849 		return error;
850 	}
851 	if (changed) {
852 		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
853 		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
854 			return ENOTSUP;
855 		}
856 		if_attach_nx = new_value;
857 	}
858 	return 0;
859 }
860 
/* Expose the nexus auto-attach policy under DEVELOPMENT/DEBUG only. */
861 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
862     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
863     0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");
864 
865 #endif /* DEVELOPMENT || DEBUG */
866 
867 static int
868 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
869 {
870 #pragma unused(oidp, arg1, arg2)
871 	unsigned int new_value;
872 	int changed;
873 	int error;
874 
875 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
876 	    sizeof(if_enable_fsw_transport_netagent),
877 	    &new_value, &changed);
878 	if (error == 0 && changed != 0) {
879 		if (new_value != 0 && new_value != 1) {
880 			/* only allow 0 or 1 */
881 			error = EINVAL;
882 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
883 			/* netagent can be enabled/disabled */
884 			if_enable_fsw_transport_netagent = new_value;
885 			if (new_value == 0) {
886 				kern_nexus_deregister_netagents();
887 			} else {
888 				kern_nexus_register_netagents();
889 			}
890 		} else {
891 			/* netagent can't be enabled */
892 			error = ENOTSUP;
893 		}
894 	}
895 	return error;
896 }
897 
/* Runtime toggle for the flowswitch transport netagent. */
898 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
899     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
900     0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
901     "enable flowswitch netagent");
902 
903 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
904 
905 #include <skywalk/os_skywalk_private.h>
906 
907 boolean_t
ifnet_nx_noauto(ifnet_t ifp)908 ifnet_nx_noauto(ifnet_t ifp)
909 {
910 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
911 }
912 
913 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)914 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
915 {
916 	return ifnet_is_low_latency(ifp);
917 }
918 
919 boolean_t
ifnet_is_low_latency(ifnet_t ifp)920 ifnet_is_low_latency(ifnet_t ifp)
921 {
922 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
923 }
924 
925 boolean_t
ifnet_needs_compat(ifnet_t ifp)926 ifnet_needs_compat(ifnet_t ifp)
927 {
928 	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
929 		return FALSE;
930 	}
931 #if !XNU_TARGET_OS_OSX
932 	/*
933 	 * To conserve memory, we plumb in the compat layer selectively; this
934 	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
935 	 * In particular, we check for Wi-Fi Access Point.
936 	 */
937 	if (IFNET_IS_WIFI(ifp)) {
938 		/* Wi-Fi Access Point */
939 		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
940 		    ifp->if_name[2] == '\0') {
941 			return if_netif_all;
942 		}
943 	}
944 #else /* XNU_TARGET_OS_OSX */
945 #pragma unused(ifp)
946 #endif /* XNU_TARGET_OS_OSX */
947 	return TRUE;
948 }
949 
950 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)951 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
952 {
953 	if (if_is_fsw_transport_netagent_enabled()) {
954 		/* check if netagent has been manually enabled for ipsec/utun */
955 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
956 			return ipsec_interface_needs_netagent(ifp);
957 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
958 			return utun_interface_needs_netagent(ifp);
959 		}
960 
961 		/* check ifnet no auto nexus override */
962 		if (ifnet_nx_noauto(ifp)) {
963 			return FALSE;
964 		}
965 
966 		/* check global if_attach_nx configuration */
967 		switch (ifp->if_family) {
968 		case IFNET_FAMILY_CELLULAR:
969 		case IFNET_FAMILY_ETHERNET:
970 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
971 				return TRUE;
972 			}
973 			break;
974 		default:
975 			break;
976 		}
977 	}
978 	return FALSE;
979 }
980 
981 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)982 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
983 {
984 #pragma unused(ifp)
985 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
986 		return TRUE;
987 	}
988 	return FALSE;
989 }
990 
991 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)992 ifnet_needs_netif_netagent(ifnet_t ifp)
993 {
994 #pragma unused(ifp)
995 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
996 }
997 
998 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)999 dlil_detach_nexus_instance(nexus_controller_t controller,
1000     const char *func_str, uuid_t instance, uuid_t device)
1001 {
1002 	errno_t         err;
1003 
1004 	if (instance == NULL || uuid_is_null(instance)) {
1005 		return FALSE;
1006 	}
1007 
1008 	/* followed by the device port */
1009 	if (device != NULL && !uuid_is_null(device)) {
1010 		err = kern_nexus_ifdetach(controller, instance, device);
1011 		if (err != 0) {
1012 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1013 			    func_str, err);
1014 		}
1015 	}
1016 	err = kern_nexus_controller_free_provider_instance(controller,
1017 	    instance);
1018 	if (err != 0) {
1019 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
1020 		    func_str, err);
1021 	}
1022 	return TRUE;
1023 }
1024 
1025 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1026 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1027     uuid_t device)
1028 {
1029 	boolean_t               detached = FALSE;
1030 	nexus_controller_t      controller = kern_nexus_shared_controller();
1031 	int                     err;
1032 
1033 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1034 	    device)) {
1035 		detached = TRUE;
1036 	}
1037 	if (provider != NULL && !uuid_is_null(provider)) {
1038 		detached = TRUE;
1039 		err = kern_nexus_controller_deregister_provider(controller,
1040 		    provider);
1041 		if (err != 0) {
1042 			DLIL_PRINTF("%s deregister_provider %d\n",
1043 			    func_str, err);
1044 		}
1045 	}
1046 	return detached;
1047 }
1048 
/*
 * Register a nexus provider of the given type (netif or flowswitch) for
 * this interface and allocate one instance of it. On success, *provider
 * and *instance hold the new UUIDs. On any failure the partially
 * created state is undone (the provider is deregistered) and the error
 * from the nexus layer is returned. Note the success path also falls
 * through the "failed" label, returning err == 0.
 */
1049 static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,nexus_type_t type,ifnet_t ifp,uuid_t * provider,uuid_t * instance,nexus_attr_t attr)1050 dlil_create_provider_and_instance(nexus_controller_t controller,
1051     nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
1052     nexus_attr_t attr)
1053 {
1054 	uuid_t          dom_prov;
1055 	errno_t         err;
1056 	nexus_name_t    provider_name;
1057 	const char      *type_name =
1058 	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
1059 	struct kern_nexus_init init;
1060 
1061 	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
1062 	if (err != 0) {
1063 		DLIL_PRINTF("%s can't get %s provider, error %d\n",
1064 		    __func__, type_name, err);
1065 		goto failed;
1066 	}
1067 
	/* provider name is "com.apple.<type>.<ifname>" */
1068 	snprintf((char *)provider_name, sizeof(provider_name),
1069 	    "com.apple.%s.%s", type_name, if_name(ifp));
1070 	err = kern_nexus_controller_register_provider(controller,
1071 	    dom_prov,
1072 	    provider_name,
1073 	    NULL,
1074 	    0,
1075 	    attr,
1076 	    provider);
1077 	if (err != 0) {
1078 		DLIL_PRINTF("%s register %s provider failed, error %d\n",
1079 		    __func__, type_name, err);
1080 		goto failed;
1081 	}
1082 	bzero(&init, sizeof(init));
1083 	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
1084 	err = kern_nexus_controller_alloc_provider_instance(controller,
1085 	    *provider,
1086 	    NULL, NULL,
1087 	    instance, &init);
1088 	if (err != 0) {
1089 		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
1090 		    __func__, type_name, err);
		/* undo the provider registration made above */
1091 		kern_nexus_controller_deregister_provider(controller,
1092 		    *provider);
1093 		goto failed;
1094 	}
1095 failed:
1096 	return err;
1097 }
1098 
/*
 * Create and attach a netif nexus (provider + instance + ifattach) for
 * this interface, filling in netif_nx with the resulting UUIDs.
 * Returns TRUE on success; on failure any partially created state is
 * torn down and FALSE is returned. No-op (FALSE) when a nexus is
 * already attached (IFCAP_SKYWALK set).
 */
1099 static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp,if_nexus_netif_t netif_nx)1100 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
1101 {
1102 	nexus_attr_t            attr = NULL;
1103 	nexus_controller_t      controller;
1104 	errno_t                 err;
1105 
1106 	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
1107 		/* it's already attached */
1108 		if (dlil_verbose) {
1109 			DLIL_PRINTF("%s: %s already has nexus attached\n",
1110 			    __func__, if_name(ifp));
1111 			/* already attached */
1112 		}
1113 		goto failed;
1114 	}
1115 
1116 	err = kern_nexus_attr_create(&attr);
1117 	if (err != 0) {
1118 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1119 		    if_name(ifp));
1120 		goto failed;
1121 	}
	/* bind the nexus to this interface's index */
1122 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
1123 	VERIFY(err == 0);
1124 
1125 	controller = kern_nexus_shared_controller();
1126 
1127 	/* create the netif provider and instance */
1128 	err = dlil_create_provider_and_instance(controller,
1129 	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
1130 	    &netif_nx->if_nif_instance, attr);
1131 	if (err != 0) {
1132 		goto failed;
1133 	}
1134 	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
1135 	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
1136 	if (err != 0) {
1137 		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
1138 		    __func__, err);
1139 		/* cleanup provider and instance */
1140 		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
1141 		    netif_nx->if_nif_instance, NULL);
1142 		goto failed;
1143 	}
1144 	return TRUE;
1145 
1146 failed:
	/* the attr is only needed during creation; destroy it either way */
1147 	if (attr != NULL) {
1148 		kern_nexus_attr_destroy(attr);
1149 	}
1150 	return FALSE;
1151 }
1152 
1153 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1154 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1155 {
1156 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1157 	    IFNET_IS_VMNET(ifp)) {
1158 		goto failed;
1159 	}
1160 	switch (ifp->if_type) {
1161 	case IFT_CELLULAR:
1162 	case IFT_ETHER:
1163 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1164 			/* don't auto-attach */
1165 			goto failed;
1166 		}
1167 		break;
1168 	default:
1169 		/* don't auto-attach */
1170 		goto failed;
1171 	}
1172 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1173 
1174 failed:
1175 	return FALSE;
1176 }
1177 
1178 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1179 dlil_is_native_netif_nexus(ifnet_t ifp)
1180 {
1181 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1182 }
1183 
1184 __attribute__((noinline))
1185 static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)1186 dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
1187 {
1188 	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
1189 	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
1190 }
1191 
1192 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1193 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1194 {
1195 	struct ifreq        ifr;
1196 	int                 error;
1197 
1198 	bzero(&ifr, sizeof(ifr));
1199 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1200 	if (error == 0) {
1201 		*ifdm_p = ifr.ifr_devmtu;
1202 	}
1203 	return error;
1204 }
1205 
1206 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint64_t * buf_size,bool * use_multi_buflet)1207 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint64_t *buf_size,
1208     bool *use_multi_buflet)
1209 {
1210 	struct kern_pbufpool_memory_info rx_pp_info;
1211 	struct kern_pbufpool_memory_info tx_pp_info;
1212 	uint32_t if_max_mtu = 0;
1213 	uint32_t drv_buf_size;
1214 	struct ifdevmtu ifdm;
1215 	int err;
1216 
1217 	/*
1218 	 * To perform intra-stack RX aggregation flowswitch needs to use
1219 	 * multi-buflet packet.
1220 	 */
1221 	*use_multi_buflet = (sk_fsw_rx_agg_tcp != 0);
1222 
1223 	/*
1224 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1225 	 * but the driver advertises the MAX MTU as only 9K.
1226 	 */
1227 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1228 		if_max_mtu = IP_MAXPACKET;
1229 		goto skip_mtu_ioctl;
1230 	}
1231 
1232 	/* determine max mtu */
1233 	bzero(&ifdm, sizeof(ifdm));
1234 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1235 	if (__improbable(err != 0)) {
1236 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1237 		    __func__, if_name(ifp));
1238 		/* use default flowswitch buffer size */
1239 		if_max_mtu = NX_FSW_BUFSIZE;
1240 	} else {
1241 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1242 		    ifdm.ifdm_max, ifdm.ifdm_current);
1243 		/* rdar://problem/44589731 */
1244 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1245 	}
1246 
1247 skip_mtu_ioctl:
1248 	if (if_max_mtu == 0) {
1249 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1250 		    __func__, if_name(ifp));
1251 		return EINVAL;
1252 	}
1253 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1254 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1255 		    "max bufsize(%d)\n", __func__,
1256 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1257 		return EINVAL;
1258 	}
1259 
1260 	/*
1261 	 * for skywalk native driver, consult the driver packet pool also.
1262 	 */
1263 	if (dlil_is_native_netif_nexus(ifp)) {
1264 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1265 		    &tx_pp_info);
1266 		if (err != 0) {
1267 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1268 			    __func__, if_name(ifp));
1269 			return ENXIO;
1270 		}
1271 		drv_buf_size = tx_pp_info.kpm_bufsize *
1272 		    tx_pp_info.kpm_max_frags;
1273 		if (if_max_mtu > drv_buf_size) {
1274 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1275 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1276 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1277 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1278 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1279 			return EINVAL;
1280 		}
1281 	} else {
1282 		drv_buf_size = if_max_mtu;
1283 	}
1284 
1285 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1286 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1287 		*use_multi_buflet = true;
1288 		/* default flowswitch buffer size */
1289 		*buf_size = NX_FSW_BUFSIZE;
1290 	} else {
1291 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1292 	}
1293 	return 0;
1294 }
1295 
1296 static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp,if_nexus_flowswitch_t nexus_fsw)1297 _dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
1298 {
1299 	nexus_attr_t            attr = NULL;
1300 	nexus_controller_t      controller;
1301 	errno_t                 err = 0;
1302 	uuid_t                  netif;
1303 	uint64_t                buf_size = 0;
1304 	bool                    multi_buflet;
1305 
1306 	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
1307 	    IFNET_IS_VMNET(ifp)) {
1308 		goto failed;
1309 	}
1310 
1311 	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
1312 		/* not possible to attach (netif native/compat not plumbed) */
1313 		goto failed;
1314 	}
1315 
1316 	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
1317 		/* don't auto-attach */
1318 		goto failed;
1319 	}
1320 
1321 	/* get the netif instance from the ifp */
1322 	err = kern_nexus_get_netif_instance(ifp, netif);
1323 	if (err != 0) {
1324 		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
1325 		    if_name(ifp));
1326 		goto failed;
1327 	}
1328 
1329 	err = kern_nexus_attr_create(&attr);
1330 	if (err != 0) {
1331 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1332 		    if_name(ifp));
1333 		goto failed;
1334 	}
1335 
1336 	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
1337 	    &multi_buflet);
1338 	if (err != 0) {
1339 		goto failed;
1340 	}
1341 	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
1342 
1343 	/* Configure flowswitch buffer size */
1344 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
1345 	VERIFY(err == 0);
1346 
1347 	/*
1348 	 * Configure flowswitch to use super-packet (multi-buflet).
1349 	 */
1350 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
1351 	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
1352 	VERIFY(err == 0);
1353 
1354 	/* create the flowswitch provider and instance */
1355 	controller = kern_nexus_shared_controller();
1356 	err = dlil_create_provider_and_instance(controller,
1357 	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
1358 	    &nexus_fsw->if_fsw_instance, attr);
1359 	if (err != 0) {
1360 		goto failed;
1361 	}
1362 
1363 	/* attach the device port */
1364 	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
1365 	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
1366 	if (err != 0) {
1367 		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
1368 		    __func__, err, if_name(ifp));
1369 		/* cleanup provider and instance */
1370 		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1371 		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1372 		goto failed;
1373 	}
1374 	return TRUE;
1375 
1376 failed:
1377 	if (err != 0) {
1378 		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
1379 		    __func__, if_name(ifp), err);
1380 	} else {
1381 		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
1382 		    __func__, if_name(ifp));
1383 	}
1384 	if (attr != NULL) {
1385 		kern_nexus_attr_destroy(attr);
1386 	}
1387 	return FALSE;
1388 }
1389 
1390 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1391 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1392 {
1393 	boolean_t               attached;
1394 	if_nexus_flowswitch     nexus_fsw;
1395 
1396 #if (DEVELOPMENT || DEBUG)
1397 	if (skywalk_netif_direct_allowed(if_name(ifp))) {
1398 		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1399 		return FALSE;
1400 	}
1401 #endif /* (DEVELOPMENT || DEBUG) */
1402 
1403 	/*
1404 	 * flowswitch attachment is not supported for interface using the
1405 	 * legacy model (IFNET_INIT_LEGACY)
1406 	 */
1407 	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1408 		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1409 		    if_name(ifp));
1410 		return FALSE;
1411 	}
1412 
1413 	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1414 		/* it's already attached */
1415 		return FALSE;
1416 	}
1417 	bzero(&nexus_fsw, sizeof(nexus_fsw));
1418 	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1419 	if (attached) {
1420 		ifnet_lock_exclusive(ifp);
1421 		if (!IF_FULLY_ATTACHED(ifp)) {
1422 			/* interface is going away */
1423 			attached = FALSE;
1424 		} else {
1425 			ifp->if_nx_flowswitch = nexus_fsw;
1426 		}
1427 		ifnet_lock_done(ifp);
1428 		if (!attached) {
1429 			/* clean up flowswitch nexus */
1430 			dlil_detach_flowswitch_nexus(&nexus_fsw);
1431 		}
1432 	}
1433 	return attached;
1434 }
1435 
1436 __attribute__((noinline))
1437 static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)1438 dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
1439 {
1440 	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1441 	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1442 }
1443 
/*
 * Quiesce the interface's data path, then detach both the flowswitch
 * and netif nexuses (flowswitch first, since it sits on top of the
 * netif) and clear their records on the ifnet. The ASSERTs enforce the
 * invariant that either all UUIDs of a nexus are set or none are.
 */
1444 __attribute__((noinline))
1445 static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)1446 dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
1447 {
1448 	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
1449 	if_nexus_netif *nx_netif = &ifp->if_nx_netif;
1450 
	/* block and drain data movement for the duration of the teardown */
1451 	ifnet_datamov_suspend_and_drain(ifp);
1452 	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
1453 		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
1454 		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
1455 		dlil_detach_flowswitch_nexus(nx_fsw);
1456 		bzero(nx_fsw, sizeof(*nx_fsw));
1457 	} else {
1458 		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
1459 		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
1460 		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
1461 	}
1462 
1463 	if (!uuid_is_null(nx_netif->if_nif_attach)) {
1464 		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
1465 		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
1466 		dlil_detach_netif_nexus(nx_netif);
1467 		bzero(nx_netif, sizeof(*nx_netif));
1468 	} else {
1469 		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
1470 		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
1471 		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
1472 	}
1473 	ifnet_datamov_resume(ifp);
1474 }
1475 
1476 boolean_t
ifnet_add_netagent(ifnet_t ifp)1477 ifnet_add_netagent(ifnet_t ifp)
1478 {
1479 	int     error;
1480 
1481 	error = kern_nexus_interface_add_netagent(ifp);
1482 	os_log(OS_LOG_DEFAULT,
1483 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1484 	    ifp->if_xname, error);
1485 	return error == 0;
1486 }
1487 
1488 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1489 ifnet_remove_netagent(ifnet_t ifp)
1490 {
1491 	int     error;
1492 
1493 	error = kern_nexus_interface_remove_netagent(ifp);
1494 	os_log(OS_LOG_DEFAULT,
1495 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1496 	    ifp->if_xname, error);
1497 	return error == 0;
1498 }
1499 
1500 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1501 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1502 {
1503 	if (!IF_FULLY_ATTACHED(ifp)) {
1504 		return FALSE;
1505 	}
1506 	return dlil_attach_flowswitch_nexus(ifp);
1507 }
1508 
1509 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1510 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1511 {
1512 	if_nexus_flowswitch     nexus_fsw;
1513 
1514 	ifnet_lock_exclusive(ifp);
1515 	nexus_fsw = ifp->if_nx_flowswitch;
1516 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1517 	ifnet_lock_done(ifp);
1518 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1519 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1520 }
1521 
1522 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1523 ifnet_attach_netif_nexus(ifnet_t ifp)
1524 {
1525 	boolean_t       nexus_attached;
1526 	if_nexus_netif  nexus_netif;
1527 
1528 	if (!IF_FULLY_ATTACHED(ifp)) {
1529 		return FALSE;
1530 	}
1531 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1532 	if (nexus_attached) {
1533 		ifnet_lock_exclusive(ifp);
1534 		ifp->if_nx_netif = nexus_netif;
1535 		ifnet_lock_done(ifp);
1536 	}
1537 	return nexus_attached;
1538 }
1539 
1540 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1541 ifnet_detach_netif_nexus(ifnet_t ifp)
1542 {
1543 	if_nexus_netif  nexus_netif;
1544 
1545 	ifnet_lock_exclusive(ifp);
1546 	nexus_netif = ifp->if_nx_netif;
1547 	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1548 	ifnet_lock_done(ifp);
1549 
1550 	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1551 	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1552 }
1553 
1554 #endif /* SKYWALK */
1555 
/* Sanity-check an inbound mbuf: must have a pkthdr and (except for
 * loopback) its recorded receive interface must match ifp; panics
 * otherwise. */
1556 #define DLIL_INPUT_CHECK(m, ifp) {                                      \
1557 	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
1558 	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
1559 	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
1560 	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
1561 	/* NOTREACHED */                                        \
1562 	}                                                               \
1563 }
1564 
/* Exponentially weighted moving average with a power-of-two decay:
 * old = old + (new - old) / 2^decay; seeds directly with new when the
 * average is still zero. */
1565 #define DLIL_EWMA(old, new, decay) do {                                 \
1566 	u_int32_t _avg;                                                 \
1567 	if ((_avg = (old)) > 0)                                         \
1568 	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
1569 	else                                                            \
1570 	        _avg = (new);                                           \
1571 	(old) = _avg;                                                   \
1572 } while (0)
1573 
1574 #define MBPS    (1ULL * 1000 * 1000)
1575 #define GBPS    (MBPS * 1000)
1576 
/* Per-link-speed RX polling thresholds; the table below is scanned by
 * downlink speed, terminated by a zero-speed sentinel entry. */
1577 struct rxpoll_time_tbl {
1578 	u_int64_t       speed;          /* downlink speed */
1579 	u_int32_t       plowat;         /* packets low watermark */
1580 	u_int32_t       phiwat;         /* packets high watermark */
1581 	u_int32_t       blowat;         /* bytes low watermark */
1582 	u_int32_t       bhiwat;         /* bytes high watermark */
1583 };
1584 
1585 static struct rxpoll_time_tbl rxpoll_tbl[] = {
1586 	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
1587 	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1588 	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1589 	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1590 	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1591 	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
1592 };
1593 
/* Mutex guarding the count of DLIL worker threads still starting up. */
1594 static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
1595     &dlil_lck_attributes);
1596 static uint32_t dlil_pending_thread_cnt = 0;
1597 
1598 static void
dlil_incr_pending_thread_count(void)1599 dlil_incr_pending_thread_count(void)
1600 {
1601 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1602 	lck_mtx_lock(&dlil_thread_sync_lock);
1603 	dlil_pending_thread_cnt++;
1604 	lck_mtx_unlock(&dlil_thread_sync_lock);
1605 }
1606 
1607 static void
dlil_decr_pending_thread_count(void)1608 dlil_decr_pending_thread_count(void)
1609 {
1610 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1611 	lck_mtx_lock(&dlil_thread_sync_lock);
1612 	VERIFY(dlil_pending_thread_cnt > 0);
1613 	dlil_pending_thread_cnt--;
1614 	if (dlil_pending_thread_cnt == 0) {
1615 		wakeup(&dlil_pending_thread_cnt);
1616 	}
1617 	lck_mtx_unlock(&dlil_thread_sync_lock);
1618 }
1619 
1620 int
proto_hash_value(u_int32_t protocol_family)1621 proto_hash_value(u_int32_t protocol_family)
1622 {
1623 	/*
1624 	 * dlil_proto_unplumb_all() depends on the mapping between
1625 	 * the hash bucket index and the protocol family defined
1626 	 * here; future changes must be applied there as well.
1627 	 */
1628 	switch (protocol_family) {
1629 	case PF_INET:
1630 		return 0;
1631 	case PF_INET6:
1632 		return 1;
1633 	case PF_VLAN:
1634 		return 2;
1635 	case PF_802154:
1636 		return 3;
1637 	case PF_UNSPEC:
1638 	default:
1639 		return 4;
1640 	}
1641 }
1642 
1643 /*
1644  * Caller must already be holding ifnet lock.
1645  */
1646 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1647 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1648 {
1649 	struct if_proto *proto = NULL;
1650 	u_int32_t i = proto_hash_value(protocol_family);
1651 
1652 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1653 
1654 	if (ifp->if_proto_hash != NULL) {
1655 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1656 	}
1657 
1658 	while (proto != NULL && proto->protocol_family != protocol_family) {
1659 		proto = SLIST_NEXT(proto, next_hash);
1660 	}
1661 
1662 	if (proto != NULL) {
1663 		if_proto_ref(proto);
1664 	}
1665 
1666 	return proto;
1667 }
1668 
/*
 * Take a reference on an if_proto; paired with if_proto_free().
 */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1674 
1675 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1676 
/*
 * Release a reference on an if_proto.  When the last reference is
 * dropped this also: invokes the protocol's detached callback (v1 or
 * v2 KPI), purges routes for the interface/protocol pair, posts
 * KEV_DL_PROTO_DETACHED, marks the interface down if no protocols
 * remain attached, and finally frees the structure.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1738 
1739 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1740 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1741 {
1742 #if !MACH_ASSERT
1743 #pragma unused(ifp)
1744 #endif
1745 	unsigned int type = 0;
1746 	int ass = 1;
1747 
1748 	switch (what) {
1749 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1750 		type = LCK_RW_ASSERT_EXCLUSIVE;
1751 		break;
1752 
1753 	case IFNET_LCK_ASSERT_SHARED:
1754 		type = LCK_RW_ASSERT_SHARED;
1755 		break;
1756 
1757 	case IFNET_LCK_ASSERT_OWNED:
1758 		type = LCK_RW_ASSERT_HELD;
1759 		break;
1760 
1761 	case IFNET_LCK_ASSERT_NOTOWNED:
1762 		/* nothing to do here for RW lock; bypass assert */
1763 		ass = 0;
1764 		break;
1765 
1766 	default:
1767 		panic("bad ifnet assert type: %d", what);
1768 		/* NOTREACHED */
1769 	}
1770 	if (ass) {
1771 		LCK_RW_ASSERT(&ifp->if_lock, type);
1772 	}
1773 }
1774 
/* Acquire the per-ifnet RW lock in shared (read) mode. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
1780 
/* Acquire the per-ifnet RW lock in exclusive (write) mode. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1786 
/* Release the per-ifnet RW lock (either shared or exclusive hold). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1792 
#if INET
/* Acquire the per-ifnet IPv4 data RW lock in shared mode. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-ifnet IPv4 data RW lock in exclusive mode. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-ifnet IPv4 data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
1812 
/* Acquire the per-ifnet IPv6 data RW lock in shared mode. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-ifnet IPv6 data RW lock in exclusive mode. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-ifnet IPv6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1830 
/* Acquire the global interface-list RW lock in shared mode. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock in exclusive mode. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the global interface-list RW lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1854 
1855 /*
1856  * dlil_ifp_protolist
1857  * - get the list of protocols attached to the interface, or just the number
1858  *   of attached protocols
1859  * - if the number returned is greater than 'list_count', truncation occurred
1860  *
1861  * Note:
1862  * - caller must already be holding ifnet lock.
1863  */
1864 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1865 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1866     u_int32_t list_count)
1867 {
1868 	u_int32_t       count = 0;
1869 	int             i;
1870 
1871 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1872 
1873 	if (ifp->if_proto_hash == NULL) {
1874 		goto done;
1875 	}
1876 
1877 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1878 		struct if_proto *proto;
1879 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1880 			if (list != NULL && count < list_count) {
1881 				list[count] = proto->protocol_family;
1882 			}
1883 			count++;
1884 		}
1885 	}
1886 done:
1887 	return count;
1888 }
1889 
/*
 * Snapshot the protocol families attached to 'ifp' under the ifnet
 * lock.  Returns the total number of attached protocols, which may
 * exceed 'count' if the caller's buffer was too small (truncation).
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1898 
/* Release a protocol-list buffer used with if_get_protolist(). */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1904 
1905 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len,boolean_t suppress_generation)1906 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1907     u_int32_t event_code, struct net_event_data *event_data,
1908     u_int32_t event_data_len, boolean_t suppress_generation)
1909 {
1910 	struct net_event_data ev_data;
1911 	struct kev_msg ev_msg;
1912 
1913 	bzero(&ev_msg, sizeof(ev_msg));
1914 	bzero(&ev_data, sizeof(ev_data));
1915 	/*
1916 	 * a net event always starts with a net_event_data structure
1917 	 * but the caller can generate a simple net event or
1918 	 * provide a longer event structure to post
1919 	 */
1920 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1921 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1922 	ev_msg.kev_subclass     = event_subclass;
1923 	ev_msg.event_code       = event_code;
1924 
1925 	if (event_data == NULL) {
1926 		event_data = &ev_data;
1927 		event_data_len = sizeof(struct net_event_data);
1928 	}
1929 
1930 	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1931 	event_data->if_family = ifp->if_family;
1932 	event_data->if_unit   = (u_int32_t)ifp->if_unit;
1933 
1934 	ev_msg.dv[0].data_length = event_data_len;
1935 	ev_msg.dv[0].data_ptr    = event_data;
1936 	ev_msg.dv[1].data_length = 0;
1937 
1938 	bool update_generation = true;
1939 	if (event_subclass == KEV_DL_SUBCLASS) {
1940 		/* Don't update interface generation for frequent link quality and state changes  */
1941 		switch (event_code) {
1942 		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
1943 		case KEV_DL_RRC_STATE_CHANGED:
1944 		case KEV_DL_PRIMARY_ELECTED:
1945 			update_generation = false;
1946 			break;
1947 		default:
1948 			break;
1949 		}
1950 	}
1951 
1952 	/*
1953 	 * Some events that update generation counts might
1954 	 * want to suppress generation count.
1955 	 * One example is node presence/absence where we still
1956 	 * issue kernel event for the invocation but want to avoid
1957 	 * expensive operation of updating generation which triggers
1958 	 * NECP client updates.
1959 	 */
1960 	if (suppress_generation) {
1961 		update_generation = false;
1962 	}
1963 
1964 	return dlil_event_internal(ifp, &ev_msg, update_generation);
1965 }
1966 
/*
 * Allocate the per-interface protocol statistics structures
 * (if_tcp_stat, if_udp_stat, if_ipv4_stat, if_ipv6_stat) for 'ifp'.
 *
 * The tcp/udp stats come from zone buffers sized with extra slack so
 * the stored pointer can be rounded up to a 64-bit boundary; the word
 * immediately preceding the aligned base holds the original zone
 * allocation address, recovered at free time.
 *
 * Returns 0 on success, EINVAL if 'ifp' is NULL.
 *
 * NOTE(review): if if_tcp_stat or if_udp_stat is already non-NULL on
 * entry, 'ret' stays EINVAL and the cleanup path below frees the
 * existing structures -- presumably callers invoke this only once per
 * ifnet; confirm before reusing elsewhere.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2052 
/*
 * Reset all opportunistic-polling (rxpoll) state on 'ifp' back to its
 * idle defaults: cancel the poll cycle, clear mode/request/flag state,
 * and zero the accumulated poll statistics and timestamps.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2071 
/*
 * Set up (and normally start) the input thread described by 'inp'.
 *
 * The thread continuation and input strategy are chosen from the
 * interface's capabilities:
 *   - ifp == NULL: the main DLIL input thread (called at dlil_init);
 *   - legacy interface with RXPOLL: hybrid-polling input thread;
 *   - net_async set, or a legacy interface: per-ifnet async thread;
 *   - otherwise (netif below, no hybrid polling): synchronous strategy
 *     with no dedicated thread -- state is still initialized, and
 *     ENODEV is returned to indicate no thread was created.
 *
 * On success the thread is started with importance 0 precedence and,
 * when net_affinity is enabled, tagged with a random affinity tag so
 * the matching workloop/starter thread can be scheduled on the same
 * processor set.  Thread creation failure is fatal (panic).
 * '*thfunc', if non-NULL, reports the chosen continuation (NULL for
 * the synchronous strategy).
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling only applies to legacy (non-netif) RXPOLL devices */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no thread to start, report ENODEV */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2206 
2207 #if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for if_input_thread_termination_spin (test builds
 * only): accepts a new spin count used to delay input thread
 * termination, rejecting the write with ENXIO when rxpoll is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only access: nothing more to do */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2229 #endif /* TEST_INPUT_THREAD_TERMINATION */
2230 
/*
 * Tear down a dlil_threading_info that is no longer in use: destroy
 * its mutex and lock group, then scrub all state back to zero while
 * verifying that no packets, threads, or affinity settings remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2256 
/*
 * Final act of a per-interface input thread, executed on the thread
 * being terminated: drain and free any queued packets, signal
 * DLIL_INPUT_TERMINATE_COMPLETE to the waiter, drop the reference
 * taken by kernel_thread_start(), and terminate the calling thread.
 * Never returns.  Must not be used for the main input thread.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* take all pending packets off the queue before signaling completion */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2304 
2305 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2306 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2307 {
2308 	thread_affinity_policy_data_t policy;
2309 
2310 	bzero(&policy, sizeof(policy));
2311 	policy.affinity_tag = tag;
2312 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2313 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2314 }
2315 
2316 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
2317 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2318 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2319     enum net_filter_event_subsystems state)
2320 {
2321 	if (state == 0) {
2322 		if_enable_fsw_transport_netagent = 1;
2323 	} else {
2324 		if_enable_fsw_transport_netagent = 0;
2325 	}
2326 	kern_nexus_update_netagents();
2327 }
2328 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2329 
2330 void
dlil_init(void)2331 dlil_init(void)
2332 {
2333 	thread_t thread = THREAD_NULL;
2334 
2335 	/*
2336 	 * The following fields must be 64-bit aligned for atomic operations.
2337 	 */
2338 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2339 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2340 	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2341 	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2342 	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2343 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2344 	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2345 	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2346 	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2347 	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2348 	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2349 	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2350 	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2351 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2352 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2353 
2354 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2355 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2356 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2357 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2358 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2359 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2360 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2361 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2362 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2363 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2364 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2365 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2366 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2367 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2368 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2369 
2370 	/*
2371 	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2372 	 */
2373 	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2374 	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2375 	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2376 	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2377 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2378 	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2379 	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2380 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2381 	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2382 	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2383 	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2384 	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2385 	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2386 	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2387 
2388 	/*
2389 	 * ... as well as the mbuf checksum flags counterparts.
2390 	 */
2391 	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2392 	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2393 	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2394 	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2395 	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2396 	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2397 	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2398 	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2399 	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2400 	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2401 	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2402 
2403 	/*
2404 	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2405 	 */
2406 	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2407 	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2408 
2409 	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2410 	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2411 	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2412 	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2413 
2414 	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2415 	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2416 	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2417 
2418 	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2419 	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2420 	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2421 	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2422 	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2423 	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2424 	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2425 	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2426 	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2427 	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2428 	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2429 	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2430 	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2431 	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2432 	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2433 	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2434 	_CASSERT(IFRTYPE_FAMILY_6LOWPAN == IFNET_FAMILY_6LOWPAN);
2435 	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2436 	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2437 
2438 	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2439 	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2440 	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2441 	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2442 	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2443 	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2444 	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2445 	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2446 	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);
2447 
2448 	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2449 	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2450 
2451 	PE_parse_boot_argn("net_affinity", &net_affinity,
2452 	    sizeof(net_affinity));
2453 
2454 	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2455 
2456 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2457 
2458 	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2459 
2460 	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2461 
2462 	VERIFY(dlil_pending_thread_cnt == 0);
2463 #if SKYWALK
2464 	boolean_t pe_enable_fsw_transport_netagent = FALSE;
2465 	boolean_t pe_disable_fsw_transport_netagent = FALSE;
2466 	boolean_t enable_fsw_netagent =
2467 	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2468 	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2469 
2470 	/*
2471 	 * Check the device tree to see if Skywalk netagent has been explicitly
2472 	 * enabled or disabled.  This can be overridden via if_attach_nx below.
2473 	 * Note that the property is a 0-length key, and so checking for the
2474 	 * presence itself is enough (no need to check for the actual value of
2475 	 * the retrieved variable.)
2476 	 */
2477 	pe_enable_fsw_transport_netagent =
2478 	    PE_get_default("kern.skywalk_netagent_enable",
2479 	    &pe_enable_fsw_transport_netagent,
2480 	    sizeof(pe_enable_fsw_transport_netagent));
2481 	pe_disable_fsw_transport_netagent =
2482 	    PE_get_default("kern.skywalk_netagent_disable",
2483 	    &pe_disable_fsw_transport_netagent,
2484 	    sizeof(pe_disable_fsw_transport_netagent));
2485 
2486 	/*
2487 	 * These two are mutually exclusive, i.e. they both can be absent,
2488 	 * but only one can be present at a time, and so we assert to make
2489 	 * sure it is correct.
2490 	 */
2491 	VERIFY((!pe_enable_fsw_transport_netagent &&
2492 	    !pe_disable_fsw_transport_netagent) ||
2493 	    (pe_enable_fsw_transport_netagent ^
2494 	    pe_disable_fsw_transport_netagent));
2495 
2496 	if (pe_enable_fsw_transport_netagent) {
2497 		kprintf("SK: netagent is enabled via an override for "
2498 		    "this platform\n");
2499 		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2500 	} else if (pe_disable_fsw_transport_netagent) {
2501 		kprintf("SK: netagent is disabled via an override for "
2502 		    "this platform\n");
2503 		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2504 	} else {
2505 		kprintf("SK: netagent is %s by default for this platform\n",
2506 		    (enable_fsw_netagent ? "enabled" : "disabled"));
2507 		if_attach_nx = IF_ATTACH_NX_DEFAULT;
2508 	}
2509 
2510 	/*
2511 	 * Now see if there's a boot-arg override.
2512 	 */
2513 	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2514 	    sizeof(if_attach_nx));
2515 	if_enable_fsw_transport_netagent =
2516 	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2517 
2518 	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2519 
2520 	if (pe_disable_fsw_transport_netagent &&
2521 	    if_enable_fsw_transport_netagent) {
2522 		kprintf("SK: netagent is force-enabled\n");
2523 	} else if (!pe_disable_fsw_transport_netagent &&
2524 	    !if_enable_fsw_transport_netagent) {
2525 		kprintf("SK: netagent is force-disabled\n");
2526 	}
2527 #ifdef XNU_TARGET_OS_OSX
2528 	if (if_enable_fsw_transport_netagent) {
2529 		net_filter_event_register(dlil_filter_event);
2530 	}
2531 #endif /* XNU_TARGET_OS_OSX */
2532 
2533 #if (DEVELOPMENT || DEBUG)
2534 	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2535 	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2536 #endif /* (DEVELOPMENT || DEBUG) */
2537 
2538 #endif /* SKYWALK */
2539 	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2540 	    sizeof(struct dlil_ifnet_dbg);
2541 	/* Enforce 64-bit alignment for dlil_ifnet structure */
2542 	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2543 	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2544 	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2545 
2546 	dlif_tcpstat_size = sizeof(struct tcpstat_local);
2547 	/* Enforce 64-bit alignment for tcpstat_local structure */
2548 	dlif_tcpstat_bufsize =
2549 	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2550 	dlif_tcpstat_bufsize = (uint32_t)
2551 	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2552 	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2553 	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2554 
2555 	dlif_udpstat_size = sizeof(struct udpstat_local);
2556 	/* Enforce 64-bit alignment for udpstat_local structure */
2557 	dlif_udpstat_bufsize =
2558 	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2559 	dlif_udpstat_bufsize = (uint32_t)
2560 	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2561 	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2562 	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2563 
2564 	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2565 
2566 	TAILQ_INIT(&dlil_ifnet_head);
2567 	TAILQ_INIT(&ifnet_head);
2568 	TAILQ_INIT(&ifnet_detaching_head);
2569 	TAILQ_INIT(&ifnet_ordered_head);
2570 
2571 	/* Initialize interface address subsystem */
2572 	ifa_init();
2573 
2574 #if PF
2575 	/* Initialize the packet filter */
2576 	pfinit();
2577 #endif /* PF */
2578 
2579 	/* Initialize queue algorithms */
2580 	classq_init();
2581 
2582 	/* Initialize packet schedulers */
2583 	pktsched_init();
2584 
2585 	/* Initialize flow advisory subsystem */
2586 	flowadv_init();
2587 
2588 	/* Initialize the pktap virtual interface */
2589 	pktap_init();
2590 
2591 	/* Initialize the service class to dscp map */
2592 	net_qos_map_init();
2593 
2594 	/* Initialize the interface low power mode event handler */
2595 	if_low_power_evhdlr_init();
2596 
2597 	/* Initialize the interface offload port list subsystem */
2598 	if_ports_used_init();
2599 
2600 #if DEBUG || DEVELOPMENT
2601 	/* Run self-tests */
2602 	dlil_verify_sum16();
2603 #endif /* DEBUG || DEVELOPMENT */
2604 
2605 	/*
2606 	 * Create and start up the main DLIL input thread and the interface
2607 	 * detacher threads once everything is initialized.
2608 	 */
2609 	dlil_incr_pending_thread_count();
2610 	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2611 
2612 	/*
2613 	 * Create ifnet detacher thread.
2614 	 * When an interface gets detached, part of the detach processing
2615 	 * is delayed. The interface is added to delayed detach list
2616 	 * and this thread is woken up to call ifnet_detach_final
2617 	 * on these interfaces.
2618 	 */
2619 	dlil_incr_pending_thread_count();
2620 	if (kernel_thread_start(ifnet_detacher_thread_func,
2621 	    NULL, &thread) != KERN_SUCCESS) {
2622 		panic_plain("%s: couldn't create detacher thread", __func__);
2623 		/* NOTREACHED */
2624 	}
2625 	thread_deallocate(thread);
2626 
2627 	/*
2628 	 * Wait for the created kernel threads for dlil to get
2629 	 * scheduled and run at least once before we proceed
2630 	 */
2631 	lck_mtx_lock(&dlil_thread_sync_lock);
2632 	while (dlil_pending_thread_cnt != 0) {
2633 		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2634 		    "threads to get scheduled at least once.\n", __func__);
2635 		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2636 		    (PZERO - 1), __func__, NULL);
2637 		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2638 	}
2639 	lck_mtx_unlock(&dlil_thread_sync_lock);
2640 	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2641 	    "scheduled at least once. Proceeding.\n", __func__);
2642 }
2643 
2644 static void
if_flt_monitor_busy(struct ifnet * ifp)2645 if_flt_monitor_busy(struct ifnet *ifp)
2646 {
2647 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2648 
2649 	++ifp->if_flt_busy;
2650 	VERIFY(ifp->if_flt_busy != 0);
2651 }
2652 
/*
 * Drop one busy reference taken via if_flt_monitor_busy(); thin alias
 * for if_flt_monitor_leave() (which asserts if_flt_lock is held).
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2658 
2659 static void
if_flt_monitor_enter(struct ifnet * ifp)2660 if_flt_monitor_enter(struct ifnet *ifp)
2661 {
2662 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2663 
2664 	while (ifp->if_flt_busy) {
2665 		++ifp->if_flt_waiters;
2666 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2667 		    (PZERO - 1), "if_flt_monitor", NULL);
2668 	}
2669 	if_flt_monitor_busy(ifp);
2670 }
2671 
2672 static void
if_flt_monitor_leave(struct ifnet * ifp)2673 if_flt_monitor_leave(struct ifnet *ifp)
2674 {
2675 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2676 
2677 	VERIFY(ifp->if_flt_busy != 0);
2678 	--ifp->if_flt_busy;
2679 
2680 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2681 		ifp->if_flt_waiters = 0;
2682 		wakeup(&ifp->if_flt_head);
2683 	}
2684 }
2685 
/*
 * Attach the interface filter described by if_filter to ifp, returning
 * a reference to the newly allocated filter via filter_ref.
 *
 * Returns 0 on success, or ENXIO if the interface is not in the global
 * list or is no longer attached.  On success the filter is linked at
 * the tail of ifp->if_flt_head under the filter monitor, and the
 * attach counters in net_api_stats are bumped.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/*
	 * The second argument (1) takes an I/O refcnt on success; it is
	 * released via ifnet_decr_iorefcnt() below on the success path.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: allocation cannot return NULL */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* Serialize against other walkers of the filter list */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* non-OS (third-party) filters are counted per-interface */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	/*
	 * NOTE(review): with Z_NOFAIL and no error set after allocation,
	 * this cleanup appears unreachable today; kept defensively.
	 */
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2775 
/*
 * Detach and destroy an interface filter.
 *
 * When detached == 0 (normal path, via dlil_detach_filter()), the
 * filter is looked up across all attached interfaces; if found, it is
 * marked filt_skip, unlinked from if_flt_head under the filter
 * monitor, and destroyed.  Returns EINVAL if the reference does not
 * match any live filter.
 *
 * When detached != 0 (implicit path from interface final-detach), the
 * list unlink has already been done by the caller; only the counters
 * are adjusted before the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				/* already being skipped means someone else is detaching it */
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* reachable with filter != NULL only on the EINVAL path above */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2896 
2897 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2898 dlil_detach_filter(interface_filter_t filter)
2899 {
2900 	if (filter == NULL) {
2901 		return;
2902 	}
2903 	dlil_detach_filter_internal(filter, 0);
2904 }
2905 
2906 __private_extern__ boolean_t
dlil_has_ip_filter(void)2907 dlil_has_ip_filter(void)
2908 {
2909 	boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2910 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2911 	return has_filter;
2912 }
2913 
2914 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2915 dlil_has_if_filter(struct ifnet *ifp)
2916 {
2917 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2918 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2919 	return has_filter;
2920 }
2921 
2922 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2923 dlil_input_wakeup(struct dlil_threading_info *inp)
2924 {
2925 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2926 
2927 	inp->dlth_flags |= DLIL_INPUT_WAITING;
2928 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
2929 		inp->dlth_wtot++;
2930 		wakeup_one((caddr_t)&inp->dlth_flags);
2931 	}
2932 }
2933 
/*
 * Bootstrap entry for the main DLIL input thread.  Performs one-time
 * sanity checks, marks the thread embryonic, wakes itself once so the
 * continuation can clear the embryonic state, then blocks into
 * dlil_main_input_thread_cont; control never returns here.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);  /* main thread serves no single ifp */
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before unlock so the wakeup below cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2956 
2957 /*
2958  * Main input thread:
2959  *
2960  *   a) handles all inbound packets for lo0
2961  *   b) handles all inbound packets for interfaces with no dedicated
2962  *	input thread (e.g. anything but Ethernet/PDP or those that support
2963  *	opportunistic polling.)
2964  *   c) protocol registrations
2965  *   d) packet injections
2966  */
/*
 * Continuation body for the main DLIL input thread.  Drains the shared
 * packet queue and the lo0-only queue, runs protocol registrations
 * when requested, then re-blocks on dlth_flags with itself as the
 * continuation.  This thread is uninterruptible and never terminates.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed through both types; inpm adds the lo0 queue */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the drained chains */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblocks dlil_init()'s wait for thread startup */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if more work arrived while we ran */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3053 
3054 /*
3055  * Input thread for interfaces with legacy input model.
3056  */
/*
 * Bootstrap entry for a per-interface (legacy input model) input
 * thread.  Names the thread after its interface, marks it embryonic,
 * wakes it once, then blocks into dlil_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy RXPOLL-capable interfaces use the rxpoll thread instead */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before unlock so the self-wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3091 
/*
 * Continuation body for a per-interface input thread (legacy model).
 * Drains this interface's packet queue, syncs input stats, and
 * re-blocks; honors DLIL_INPUT_TERMINATE by handing the thread to
 * dlil_terminate_input_thread(), which does not return.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the drained chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (other than terminate) arrived */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3195 
3196 /*
3197  * Input thread for interfaces with opportunistic polling input model.
3198  */
/*
 * Bootstrap entry for a per-interface input thread using the
 * opportunistic polling model (IFEF_RXPOLL + IFXF_LEGACY).  Names the
 * thread, marks it embryonic, wakes it once, then blocks into
 * dlil_rxpoll_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before unlock so the self-wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3230 
/*
 * Continuation body for an opportunistic-polling input thread.
 *
 * Each pass drains the interface's queue, accumulates packet/byte/
 * wakeup statistics into EWMAs over a sampling hold period, and uses
 * the low/high watermarks to decide whether to switch the interface
 * between IFNET_MODEL_INPUT_POLL_OFF and _ON.  A mode change is pushed
 * to the driver via if_input_ctl while holding an I/O refcnt.  Honors
 * DLIL_INPUT_TERMINATE like the legacy continuation.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass: leave embryonic state, skip the sampling work */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to its minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* wait out the sampling hold period before evaluating */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* below both low watermarks: stop polling */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (other than terminate) arrived */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3516 
3517 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3518 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3519 {
3520 	if (p != NULL) {
3521 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3522 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3523 			return EINVAL;
3524 		}
3525 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3526 		    p->packets_lowat >= p->packets_hiwat) {
3527 			return EINVAL;
3528 		}
3529 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3530 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3531 			return EINVAL;
3532 		}
3533 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3534 		    p->bytes_lowat >= p->bytes_hiwat) {
3535 			return EINVAL;
3536 		}
3537 		if (p->interval_time != 0 &&
3538 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3539 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3540 		}
3541 	}
3542 	return 0;
3543 }
3544 
/*
 * Recompute the interface's RX polling tunables (packet/byte/work-request
 * watermarks, per-poll packet limit, and poll interval).
 *
 * If the input link rate is unknown (zero) and no explicit parameters
 * were given, polling is effectively disabled.  Otherwise values come
 * from the auto-tuning table (rxpoll_tbl, indexed by link speed) unless
 * the caller supplied a non-zero override in "p".
 *
 * Caller must hold the input thread's dlth_lock (see
 * dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	/* no link rate and no explicit params: disable polling */
	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry whose speed is <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * NOTE(review): a non-zero if_rxpoll_max sysctl takes
		 * precedence over the caller's packets_limit here, and a
		 * non-default if_rxpoll_interval_time likewise overrides
		 * the caller's interval_time — confirm this override
		 * ordering is intended.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond holdtimes into timespec form for later use */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3614 
3615 /*
3616  * Must be called on an attached ifnet (caller is expected to check.)
3617  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3618  */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	/* interface must support RX polling and have an input thread */
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	/* "locked" means the caller already holds dlth_lock */
	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3656 
3657 /*
3658  * Must be called on an attached ifnet (caller is expected to check.)
3659  */
3660 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3661 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3662 {
3663 	struct dlil_threading_info *inp;
3664 
3665 	VERIFY(ifp != NULL && p != NULL);
3666 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3667 		return ENXIO;
3668 	}
3669 
3670 	bzero(p, sizeof(*p));
3671 
3672 	lck_mtx_lock(&inp->dlth_lock);
3673 	p->packets_limit = ifp->if_rxpoll_plim;
3674 	p->packets_lowat = ifp->if_rxpoll_plowat;
3675 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3676 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3677 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3678 	p->interval_time = ifp->if_rxpoll_ival;
3679 	lck_mtx_unlock(&inp->dlth_lock);
3680 
3681 	return 0;
3682 }
3683 
3684 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3685 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3686     const struct ifnet_stat_increment_param *s)
3687 {
3688 	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3689 }
3690 
3691 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3692 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3693     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3694 {
3695 	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3696 }
3697 
3698 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3699 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3700     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3701 {
3702 	return ifnet_input_common(ifp, m_head, m_tail, s,
3703 	           (m_head != NULL), TRUE);
3704 }
3705 
/*
 * Common back end for ifnet_input(), ifnet_input_extended() and
 * ifnet_input_poll(): validate the packet chain, compute (or verify)
 * packet/byte counts, take an IO reference on the interface, and hand
 * the chain to the interface's DLIL input function.
 *
 * "ext" means the caller supplied m_tail and stats; "poll" means the
 * chain came from the RX polling path (and may be empty).  On any
 * validation failure the entire chain is freed and EINVAL is returned.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* an empty chain is only valid for poll; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count it */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* debug: recount the chain instead of trusting s */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): the call below passes "s", not "&_s", so when the
	 * caller supplied stats the recomputed _s.packets_in/_s.bytes_in
	 * are only used in the s == NULL case (where s aliases _s).  For
	 * ext callers the counts were asserted equal above, so this looks
	 * intentional — confirm before changing.
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3820 
3821 #if SKYWALK
/*
 * Atomically install "fn" as the interface's DLIL input handler, but
 * only if the current handler is still the default dlil_input_handler.
 * Returns 0 on success, EBUSY if another handler is already installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3829 
/*
 * Restore the default DLIL input handler.  The compare-and-swap is
 * retried in a loop because if_input_dlil may change between reading
 * its current value and the swap attempt; the loop exits once the
 * swap back to dlil_input_handler succeeds.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
/*
 * Atomically install "fn" as the interface's DLIL output handler, but
 * only if the current handler is still the default dlil_output_handler.
 * Returns 0 on success, EBUSY if another handler is already installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3846 
/*
 * Restore the default DLIL output handler.  Retries the CAS until the
 * swap from the currently-installed handler back to
 * dlil_output_handler succeeds (the pointer may change concurrently).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3856 #endif /* SKYWALK */
3857 
3858 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3859 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3860 {
3861 	return ifp->if_output(ifp, m);
3862 }
3863 
3864 errno_t
dlil_input_handler(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)3865 dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
3866     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
3867     boolean_t poll, struct thread *tp)
3868 {
3869 	struct dlil_threading_info *inp = ifp->if_inp;
3870 
3871 	if (__improbable(inp == NULL)) {
3872 		inp = dlil_main_input_thread;
3873 	}
3874 
3875 	return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
3876 }
3877 
/*
 * Asynchronous input strategy: enqueue the packet chain on the input
 * thread's receive queue, fold the caller's stats in, and wake the
 * input thread; actual protocol processing happens later in that
 * thread's context.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the lock: dlil_affinity_set may block */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: recount the chain and verify it matches the stats */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* notify outside the lock to avoid holding it across the callout */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
3990 
/*
 * Synchronous input strategy: enqueue the chain, then immediately
 * drain the input thread's queue and process all pending packets in
 * the calling thread's context (no wakeup/block round trip).  Never
 * used for the main input thread.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: recount the chain and verify it matches the stats */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued so far, not just this chain */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	/* NB: braceless conditional — the next statement is its body */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4075 
4076 #if SKYWALK
/*
 * Atomically install "fn" as the interface's if_output routine, but
 * only if if_output currently equals if_save_output (presumably the
 * driver's original output routine saved elsewhere — not visible in
 * this file section).  Returns 0 on success, EBUSY otherwise.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4084 
/*
 * Restore if_output to the saved handler (if_save_output), retrying
 * the CAS until it succeeds against the current if_output value.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4094 
/*
 * Atomically install "fn" as the interface's if_start routine, but
 * only if if_start currently equals if_save_start.  Returns 0 on
 * success, EBUSY if some other handler is already installed.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return atomic_test_set_ptr(&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4102 
/*
 * Restore if_start to the saved handler (if_save_start), retrying the
 * CAS until it succeeds against the current if_start value.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4112 #endif /* SKYWALK */
4113 
/*
 * Common body for kicking an interface's starter thread: bump the
 * start request counter and wake the thread if it is idle.  When
 * "resetfc" is TRUE the flow-controlled flag is cleared first (resume
 * path); otherwise a flow-controlled interface is left alone.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	/* only interfaces with a starter thread (TXSTART) qualify */
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the thread only when it is idle and either delayed start
	 * does not apply (resume, no ENQUEUE_MULTI) or the queue has
	 * grown past the delayed-start threshold.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4143 
4144 void
ifnet_start(struct ifnet * ifp)4145 ifnet_start(struct ifnet *ifp)
4146 {
4147 	ifnet_start_common(ifp, FALSE);
4148 }
4149 
/*
 * Entry point for the per-interface starter thread: name the thread,
 * optionally bind it to the main input thread's affinity set (lo0
 * only), then park in embryonic state until the first real wakeup is
 * handled by ifnet_start_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	/* block with continuation; all further work runs in _cont */
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4215 
/*
 * Continuation for the starter thread: service start requests by
 * calling the driver's if_start routine until no new requests arrive
 * (or the interface is flow-controlled/terminating), then block again
 * — possibly with a deadline for TBR pacing or delayed start — or
 * tear the thread down.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: just leave embryonic state */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/* delayed start: hold off while the send queue is short */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* delayed-start pending: wake up after the delay timeout */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero timespec means no deadline at all */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4363 
4364 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4365 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4366 {
4367 	if (ts == NULL) {
4368 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4369 	} else {
4370 		*(&ifp->if_start_cycle) = *ts;
4371 	}
4372 
4373 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4374 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4375 		    if_name(ifp), ts->tv_nsec);
4376 	}
4377 }
4378 
4379 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4380 ifnet_poll_wakeup(struct ifnet *ifp)
4381 {
4382 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4383 
4384 	ifp->if_poll_req++;
4385 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4386 	    ifp->if_poll_thread != THREAD_NULL) {
4387 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4388 	}
4389 }
4390 
/*
 * Public entry point to request a poll: take if_poll_lock and signal
 * the poller thread (no-op if it is already running).
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4401 
/*
 * Entry point for the per-interface RX poller thread: name the thread,
 * then park in embryonic state; all further work is handled by
 * ifnet_poll_thread_cont() via the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* block with continuation; all further work runs in _cont */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4430 
/*
 * Continuation of the RX poller thread.  Each activation drains the
 * driver via if_input_poll() until no new poll request arrives, then
 * re-arms a (possibly timed) wait and blocks on itself as continuation.
 * Terminates the thread when IF_POLLF_TERMINATING is observed or the
 * wait was interrupted.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* detach in progress (or interrupted sleep) => shut down */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First activation after thread creation: leave embryonic state
	 * and let ifnet_attach() proceed, then park again via "skip".
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot the request counter; compared after the poll */
		u_int16_t req = ifp->if_poll_req;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll: still notify the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		/* block on ourselves as continuation; no stack is kept */
		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4597 
4598 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4599 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4600 {
4601 	if (ts == NULL) {
4602 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4603 	} else {
4604 		*(&ifp->if_poll_cycle) = *ts;
4605 	}
4606 
4607 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4608 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4609 		    if_name(ifp), ts->tv_nsec);
4610 	}
4611 }
4612 
4613 void
ifnet_purge(struct ifnet * ifp)4614 ifnet_purge(struct ifnet *ifp)
4615 {
4616 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4617 		if_qflush_snd(ifp, false);
4618 	}
4619 }
4620 
4621 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4622 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4623 {
4624 	IFCQ_LOCK_ASSERT_HELD(ifq);
4625 
4626 	if (!(IFCQ_IS_READY(ifq))) {
4627 		return;
4628 	}
4629 
4630 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4631 		struct tb_profile tb = {
4632 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4633 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4634 		};
4635 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4636 	}
4637 
4638 	ifclassq_update(ifq, ev);
4639 }
4640 
4641 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4642 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4643 {
4644 	switch (ev) {
4645 	case CLASSQ_EV_LINK_BANDWIDTH:
4646 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4647 			ifp->if_poll_update++;
4648 		}
4649 		break;
4650 
4651 	default:
4652 		break;
4653 	}
4654 }
4655 
4656 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4657 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4658 {
4659 	struct ifclassq *ifq;
4660 	u_int32_t omodel;
4661 	errno_t err;
4662 
4663 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4664 		return EINVAL;
4665 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4666 		return ENXIO;
4667 	}
4668 
4669 	ifq = ifp->if_snd;
4670 	IFCQ_LOCK(ifq);
4671 	omodel = ifp->if_output_sched_model;
4672 	ifp->if_output_sched_model = model;
4673 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4674 		ifp->if_output_sched_model = omodel;
4675 	}
4676 	IFCQ_UNLOCK(ifq);
4677 
4678 	return err;
4679 }
4680 
4681 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4682 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4683 {
4684 	if (ifp == NULL) {
4685 		return EINVAL;
4686 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4687 		return ENXIO;
4688 	}
4689 
4690 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4691 
4692 	return 0;
4693 }
4694 
4695 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4696 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4697 {
4698 	if (ifp == NULL || maxqlen == NULL) {
4699 		return EINVAL;
4700 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4701 		return ENXIO;
4702 	}
4703 
4704 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4705 
4706 	return 0;
4707 }
4708 
4709 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4710 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4711 {
4712 	errno_t err;
4713 
4714 	if (ifp == NULL || pkts == NULL) {
4715 		err = EINVAL;
4716 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4717 		err = ENXIO;
4718 	} else {
4719 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4720 		    pkts, NULL);
4721 	}
4722 
4723 	return err;
4724 }
4725 
4726 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4727 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4728     u_int32_t *pkts, u_int32_t *bytes)
4729 {
4730 	errno_t err;
4731 
4732 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4733 	    (pkts == NULL && bytes == NULL)) {
4734 		err = EINVAL;
4735 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4736 		err = ENXIO;
4737 	} else {
4738 		err = ifclassq_get_len(ifp->if_snd, sc, pkts, bytes);
4739 	}
4740 
4741 	return err;
4742 }
4743 
4744 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4745 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4746 {
4747 	struct dlil_threading_info *inp;
4748 
4749 	if (ifp == NULL) {
4750 		return EINVAL;
4751 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4752 		return ENXIO;
4753 	}
4754 
4755 	if (maxqlen == 0) {
4756 		maxqlen = if_rcvq_maxlen;
4757 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4758 		maxqlen = IF_RCVQ_MINLEN;
4759 	}
4760 
4761 	inp = ifp->if_inp;
4762 	lck_mtx_lock(&inp->dlth_lock);
4763 	qlimit(&inp->dlth_pkts) = maxqlen;
4764 	lck_mtx_unlock(&inp->dlth_lock);
4765 
4766 	return 0;
4767 }
4768 
4769 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4770 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4771 {
4772 	struct dlil_threading_info *inp;
4773 
4774 	if (ifp == NULL || maxqlen == NULL) {
4775 		return EINVAL;
4776 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4777 		return ENXIO;
4778 	}
4779 
4780 	inp = ifp->if_inp;
4781 	lck_mtx_lock(&inp->dlth_lock);
4782 	*maxqlen = qlimit(&inp->dlth_pkts);
4783 	lck_mtx_unlock(&inp->dlth_lock);
4784 	return 0;
4785 }
4786 
4787 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4788 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4789     uint16_t delay_timeout)
4790 {
4791 	if (delay_qlen > 0 && delay_timeout > 0) {
4792 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4793 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4794 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4795 		/* convert timeout to nanoseconds */
4796 		ifp->if_start_delay_timeout *= 1000;
4797 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4798 		    ifp->if_xname, (uint32_t)delay_qlen,
4799 		    (uint32_t)delay_timeout);
4800 	} else {
4801 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4802 	}
4803 }
4804 
4805 /*
4806  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4807  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4808  * buf holds the full header.
4809  */
4810 static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t * buf,uint8_t ip_ver)4811 ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
4812 {
4813 	struct ip *ip;
4814 	struct ip6_hdr *ip6;
4815 	uint8_t lbuf[64] __attribute__((aligned(8)));
4816 	uint8_t *p = buf;
4817 
4818 	if (ip_ver == IPVERSION) {
4819 		uint8_t old_tos;
4820 		uint32_t sum;
4821 
4822 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4823 			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
4824 			bcopy(buf, lbuf, sizeof(struct ip));
4825 			p = lbuf;
4826 		}
4827 		ip = (struct ip *)(void *)p;
4828 		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
4829 			return;
4830 		}
4831 
4832 		DTRACE_IP1(clear__v4, struct ip *, ip);
4833 		old_tos = ip->ip_tos;
4834 		ip->ip_tos &= IPTOS_ECN_MASK;
4835 		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
4836 		sum = (sum >> 16) + (sum & 0xffff);
4837 		ip->ip_sum = (uint16_t)(sum & 0xffff);
4838 
4839 		if (__improbable(p == lbuf)) {
4840 			bcopy(lbuf, buf, sizeof(struct ip));
4841 		}
4842 	} else {
4843 		uint32_t flow;
4844 		ASSERT(ip_ver == IPV6_VERSION);
4845 
4846 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4847 			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
4848 			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
4849 			p = lbuf;
4850 		}
4851 		ip6 = (struct ip6_hdr *)(void *)p;
4852 		flow = ntohl(ip6->ip6_flow);
4853 		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
4854 			return;
4855 		}
4856 
4857 		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
4858 		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
4859 
4860 		if (__improbable(p == lbuf)) {
4861 			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
4862 		}
4863 	}
4864 }
4865 
/*
 * Core enqueue routine: timestamps the packet, applies the multicast
 * DSCP-clearing workaround for Wi-Fi infra interfaces, runs the
 * start-callback delay heuristics, enqueues onto the classq (ifcq if
 * given, else ifp->if_snd), and kicks the driver's start routine.
 * The caller's packet is always consumed.  Handles both mbufs and
 * (under SKYWALK) native kernel packets.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		/* stamp the packet with the current uptime if needed */
		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* need at least the Ethernet header contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP: skip the DSCP workaround */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may relocate the data */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* too short for an Ethernet header: skip workaround */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP: skip the DSCP workaround */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate the heuristics */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5176 
5177 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5178 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, classq_pkt_t *head,
5179     classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5180     boolean_t *pdrop)
5181 {
5182 	int error;
5183 
5184 	/* enqueue the packet (caller consumes object) */
5185 	error = ifclassq_enqueue(ifp->if_snd, head, tail, cnt, bytes, pdrop);
5186 
5187 	/*
5188 	 * Tell the driver to start dequeueing; do this even when the queue
5189 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5190 	 * be dequeueing from other unsuspended queues.
5191 	 */
5192 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5193 		ifnet_start(ifp);
5194 	}
5195 	return error;
5196 }
5197 
5198 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5199 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5200 {
5201 	struct ifnet *ifp = handle;
5202 	boolean_t pdrop;        /* dummy */
5203 	uint32_t i;
5204 
5205 	ASSERT(n_pkts >= 1);
5206 	for (i = 0; i < n_pkts - 1; i++) {
5207 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5208 		    FALSE, &pdrop);
5209 	}
5210 	/* flush with the last packet */
5211 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5212 	    TRUE, &pdrop);
5213 
5214 	return 0;
5215 }
5216 
5217 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5218 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5219     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5220 {
5221 	if (ifp->if_output_netem != NULL) {
5222 		return netem_enqueue(ifp->if_output_netem, pkt, pdrop);
5223 	} else {
5224 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5225 	}
5226 }
5227 
5228 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5229 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5230 {
5231 	boolean_t pdrop;
5232 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5233 }
5234 
5235 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5236 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5237     boolean_t *pdrop)
5238 {
5239 	classq_pkt_t pkt;
5240 
5241 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5242 	    m->m_nextpkt != NULL) {
5243 		if (m != NULL) {
5244 			m_freem_list(m);
5245 			*pdrop = TRUE;
5246 		}
5247 		return EINVAL;
5248 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5249 	    !IF_FULLY_ATTACHED(ifp)) {
5250 		/* flag tested without lock for performance */
5251 		m_freem(m);
5252 		*pdrop = TRUE;
5253 		return ENXIO;
5254 	} else if (!(ifp->if_flags & IFF_UP)) {
5255 		m_freem(m);
5256 		*pdrop = TRUE;
5257 		return ENETDOWN;
5258 	}
5259 
5260 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5261 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5262 }
5263 
5264 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5265 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5266     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5267     boolean_t *pdrop)
5268 {
5269 	classq_pkt_t head, tail;
5270 
5271 	ASSERT(m_head != NULL);
5272 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5273 	ASSERT(m_tail != NULL);
5274 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5275 	ASSERT(ifp != NULL);
5276 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5277 
5278 	if (!IF_FULLY_ATTACHED(ifp)) {
5279 		/* flag tested without lock for performance */
5280 		m_freem_list(m_head);
5281 		*pdrop = TRUE;
5282 		return ENXIO;
5283 	} else if (!(ifp->if_flags & IFF_UP)) {
5284 		m_freem_list(m_head);
5285 		*pdrop = TRUE;
5286 		return ENETDOWN;
5287 	}
5288 
5289 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5290 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5291 	return ifnet_enqueue_ifclassq_chain(ifp, &head, &tail, cnt, bytes,
5292 	           flush, pdrop);
5293 }
5294 
5295 #if SKYWALK
5296 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5297 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5298     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5299 {
5300 	classq_pkt_t pkt;
5301 
5302 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5303 
5304 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5305 		if (kpkt != NULL) {
5306 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5307 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5308 			*pdrop = TRUE;
5309 		}
5310 		return EINVAL;
5311 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5312 	    !IF_FULLY_ATTACHED(ifp))) {
5313 		/* flag tested without lock for performance */
5314 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5315 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5316 		*pdrop = TRUE;
5317 		return ENXIO;
5318 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5319 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5320 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5321 		*pdrop = TRUE;
5322 		return ENETDOWN;
5323 	}
5324 
5325 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5326 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5327 }
5328 
/*
 * Enqueue a native Skywalk packet on the interface's default send
 * queue (no explicit ifclassq); see ifnet_enqueue_pkt_common().
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5335 
/*
 * Enqueue a native Skywalk packet on a caller-supplied ifclassq;
 * see ifnet_enqueue_pkt_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5342 
5343 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5344 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5345     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5346     boolean_t *pdrop)
5347 {
5348 	classq_pkt_t head, tail;
5349 
5350 	ASSERT(k_head != NULL);
5351 	ASSERT(k_tail != NULL);
5352 	ASSERT(ifp != NULL);
5353 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5354 
5355 	if (!IF_FULLY_ATTACHED(ifp)) {
5356 		/* flag tested without lock for performance */
5357 		pp_free_packet_chain(k_head, NULL);
5358 		*pdrop = TRUE;
5359 		return ENXIO;
5360 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5361 		pp_free_packet_chain(k_head, NULL);
5362 		*pdrop = TRUE;
5363 		return ENETDOWN;
5364 	}
5365 
5366 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5367 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5368 	return ifnet_enqueue_ifclassq_chain(ifp, &head, &tail, cnt, bytes,
5369 	           flush, pdrop);
5370 }
5371 #endif /* SKYWALK */
5372 
5373 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5374 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5375 {
5376 	errno_t rc;
5377 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5378 
5379 	if (ifp == NULL || mp == NULL) {
5380 		return EINVAL;
5381 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5382 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5383 		return ENXIO;
5384 	}
5385 	if (!ifnet_is_attached(ifp, 1)) {
5386 		return ENXIO;
5387 	}
5388 
5389 #if SKYWALK
5390 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5391 #endif /* SKYWALK */
5392 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5393 	    &pkt, NULL, NULL, NULL);
5394 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5395 	ifnet_decr_iorefcnt(ifp);
5396 	*mp = pkt.cp_mbuf;
5397 	return rc;
5398 }
5399 
/*
 * Dequeue a single mbuf belonging to the given service class from the
 * interface's output queue.  Same preconditions as ifnet_dequeue():
 * TXSTART model, valid scheduling model, attached interface.
 */
errno_t
ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
    struct mbuf **mp)
{
	errno_t rc;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O reference on success */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI is not for Skywalk-native interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL);
	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
	/* release the I/O reference taken by ifnet_is_attached() */
	ifnet_decr_iorefcnt(ifp);
	*mp = pkt.cp_mbuf;
	return rc;
}
5427 
/*
 * Dequeue up to pkt_limit mbufs from the interface's output queue.
 * Returns the chain head in *head and, if requested, the tail, packet
 * count and total byte count via the optional tail/cnt/len pointers.
 */
errno_t
ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O reference on success */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI is not for Skywalk-native interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	/* release the I/O reference taken by ifnet_is_attached() */
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5459 
/*
 * Dequeue mbufs from the interface's output queue up to byte_limit
 * total bytes (packet count is bounded only by the classq maximum).
 * Returns the chain head in *head; tail/cnt/len are optional outputs.
 */
errno_t
ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || byte_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O reference on success */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI is not for Skywalk-native interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
	    byte_limit, &pkt_head, &pkt_tail, cnt, len);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	/* release the I/O reference taken by ifnet_is_attached() */
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5491 
/*
 * Dequeue up to pkt_limit mbufs of the given service class from the
 * interface's output queue.  Returns the chain head in *head;
 * tail/cnt/len are optional outputs.
 */
errno_t
ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
    u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
	    !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O reference on success */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI is not for Skywalk-native interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
	    cnt, len);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	/* release the I/O reference taken by ifnet_is_attached() */
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5526 
5527 #if XNU_TARGET_OS_OSX
5528 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5529 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5530     const struct sockaddr *dest, const char *dest_linkaddr,
5531     const char *frame_type, u_int32_t *pre, u_int32_t *post)
5532 {
5533 	if (pre != NULL) {
5534 		*pre = 0;
5535 	}
5536 	if (post != NULL) {
5537 		*post = 0;
5538 	}
5539 
5540 	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5541 }
5542 #endif /* XNU_TARGET_OS_OSX */
5543 
5544 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5545 packet_has_vlan_tag(struct mbuf * m)
5546 {
5547 	u_int   tag = 0;
5548 
5549 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5550 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5551 		if (tag == 0) {
5552 			/* the packet is just priority-tagged, clear the bit */
5553 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5554 		}
5555 	}
5556 	return tag != 0;
5557 }
5558 
/*
 * Run the inbound packet through the interface filter chain.
 *
 * Returns 0 when the packet should continue up the stack, or a
 * non-zero filter result; a filter may also swap/consume the mbuf
 * via m_p.  EJUSTRETURN from a filter means the packet was consumed
 * by the filter, so the caller must not free it.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; the busy
			 * marker above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5619 
/*
 * Run the outbound packet through the interface filter chain.
 * Mirror image of dlil_interface_filters_input(): returns 0 to let
 * the packet proceed, or the first non-zero filter result.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; the busy
			 * marker above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5669 
/*
 * Deliver a chain of mbufs to the protocol attached to the interface.
 * v1 protocols take one packet (plus frame header) per call; v2
 * protocols accept the whole m_nextpkt-linked list in a single call.
 * Consumes the packets: anything the protocol rejects (other than
 * EJUSTRETURN) is freed here.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char *  frame_header;
			mbuf_t  next_packet;

			/* detach this packet from the chain before handing off */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5701 
5702 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5703 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5704     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5705 {
5706 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5707 
5708 	if (s->packets_in != 0) {
5709 		d->packets_in += s->packets_in;
5710 	}
5711 	if (s->bytes_in != 0) {
5712 		d->bytes_in += s->bytes_in;
5713 	}
5714 	if (s->errors_in != 0) {
5715 		d->errors_in += s->errors_in;
5716 	}
5717 
5718 	if (s->packets_out != 0) {
5719 		d->packets_out += s->packets_out;
5720 	}
5721 	if (s->bytes_out != 0) {
5722 		d->bytes_out += s->bytes_out;
5723 	}
5724 	if (s->errors_out != 0) {
5725 		d->errors_out += s->errors_out;
5726 	}
5727 
5728 	if (s->collisions != 0) {
5729 		d->collisions += s->collisions;
5730 	}
5731 	if (s->dropped != 0) {
5732 		d->dropped += s->dropped;
5733 	}
5734 
5735 	if (poll) {
5736 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5737 	}
5738 }
5739 
/*
 * Flush the input thread's accumulated statistics into the ifnet's
 * global counters and reset the accumulator.  Returns TRUE when the
 * interface has a data threshold configured (caller may then notify).
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
5799 
5800 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5801 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5802 {
5803 	return dlil_input_packet_list_common(ifp, m, 0,
5804 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5805 }
5806 
5807 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5808 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5809     u_int32_t cnt, ifnet_model_t mode)
5810 {
5811 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5812 }
5813 
/*
 * Core DLIL input path.  Walks the m_nextpkt-linked list of inbound
 * mbufs and, for each packet: takes a data-mov (I/O) reference on the
 * receiving interface, classifies it via the interface demux, applies
 * CLAT46 translation and checksum-offload fix-ups as needed, runs the
 * interface filter chain, and finally batches consecutive packets for
 * the same protocol into one list handed to dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used
 * (packets from multiple interfaces can be interleaved in the list).
 * Consumes all packets; anything that cannot be delivered is freed.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;
	mbuf_t *pkt_next = NULL;
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only for extended calls in poll mode */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* kick the poller every poll_ival packets on legacy RXPOLL */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach from the chain; remember the frame header */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN: demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast
		 * packets as they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			/*
			 * NOTE(review): m is dereferenced here before the
			 * error check below; this assumes dlil_clat64()
			 * leaves m valid on failure -- confirm.
			 */
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* invalidate the checksum if the header span is bogus */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			atomic_add_64(&ifp->if_imcasts, 1);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			/* EJUSTRETURN: a filter consumed the packet */
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6146 
6147 errno_t
if_mcasts_update(struct ifnet * ifp)6148 if_mcasts_update(struct ifnet *ifp)
6149 {
6150 	errno_t err;
6151 
6152 	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6153 	if (err == EAFNOSUPPORT) {
6154 		err = 0;
6155 	}
6156 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6157 	    "(err=%d)\n", if_name(ifp),
6158 	    (err == 0 ? "successfully restored" : "failed to restore"),
6159 	    ifp->if_updatemcasts, err);
6160 
6161 	/* just return success */
6162 	return 0;
6163 }
6164 
/*
 * Post a kernel event message.  If ifp is set, the interface's
 * generation count is bumped first; NECP clients are always told to
 * re-evaluate.  Returns the result of kev_post_msg().
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp != NULL) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6179 
6180 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6181 dlil_post_sifflags_msg(struct ifnet * ifp)
6182 {
6183 	struct kev_msg ev_msg;
6184 	struct net_event_data ev_data;
6185 
6186 	bzero(&ev_data, sizeof(ev_data));
6187 	bzero(&ev_msg, sizeof(ev_msg));
6188 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6189 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6190 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6191 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6192 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6193 	ev_data.if_family = ifp->if_family;
6194 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6195 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6196 	ev_msg.dv[0].data_ptr = &ev_data;
6197 	ev_msg.dv[1].data_length = 0;
6198 	dlil_post_complete_msg(ifp, &ev_msg);
6199 }
6200 
6201 #define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver a kernel event to, in order: the interface filters, every
 * attached protocol's event handler, and the interface's own if_event
 * callback; finally post the event message (bumping the interface
 * generation when update_generation is set).
 *
 * Attached protocols are snapshotted into a referenced array (stack
 * array for up to TMP_IF_PROTO_ARR_SIZE entries, heap otherwise)
 * because the ifnet lock must be dropped before invoking callbacks.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the callback; the busy
			 * marker above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* take a reference on each proto while still under the lock */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke the protocol event handlers without holding any lock */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	/*
	 * NOTE(review): on the kalloc failure path tmp_ifproto_arr is
	 * NULL here; this relies on kfree_type() tolerating NULL -- confirm.
	 */
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6301 
6302 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6303 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6304 {
6305 	struct kev_msg kev_msg;
6306 	int result = 0;
6307 
6308 	if (ifp == NULL || event == NULL) {
6309 		return EINVAL;
6310 	}
6311 
6312 	bzero(&kev_msg, sizeof(kev_msg));
6313 	kev_msg.vendor_code = event->vendor_code;
6314 	kev_msg.kev_class = event->kev_class;
6315 	kev_msg.kev_subclass = event->kev_subclass;
6316 	kev_msg.event_code = event->event_code;
6317 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6318 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6319 	kev_msg.dv[1].data_length = 0;
6320 
6321 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6322 
6323 	return result;
6324 }
6325 
6326 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6327 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6328 {
6329 	mbuf_t  n = m;
6330 	int chainlen = 0;
6331 
6332 	while (n != NULL) {
6333 		chainlen++;
6334 		n = n->m_next;
6335 	}
6336 	switch (chainlen) {
6337 	case 0:
6338 		break;
6339 	case 1:
6340 		atomic_add_64(&cls->cls_one, 1);
6341 		break;
6342 	case 2:
6343 		atomic_add_64(&cls->cls_two, 1);
6344 		break;
6345 	case 3:
6346 		atomic_add_64(&cls->cls_three, 1);
6347 		break;
6348 	case 4:
6349 		atomic_add_64(&cls->cls_four, 1);
6350 		break;
6351 	case 5:
6352 	default:
6353 		atomic_add_64(&cls->cls_five_or_more, 1);
6354 		break;
6355 	}
6356 }
6357 
6358 #if CONFIG_DTRACE
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	/*
	 * Fire the DTrace ip:::send probe for IPv4/IPv6 packets.
	 * Kept out of line (noinline) so the probe sites do not bloat
	 * the dlil_output() fast path.  Both branches use DTRACE_IP6;
	 * the unused address-family header argument is passed as NULL.
	 */
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
6375 #endif /* CONFIG_DTRACE */
6376 
6377 /*
6378  * dlil_output
6379  *
6380  * Caller should have a lock on the protocol domain if the protocol
6381  * doesn't support finer grained locking. In most cases, the lock
6382  * will be held from the socket layer and won't be released until
6383  * we return back to the socket layer.
6384  *
6385  * This does mean that we must take a protocol lock before we take
6386  * an interface lock if we're going to take both. This makes sense
6387  * because a protocol is likely to interact with an ifp while it
6388  * is under the protocol lock.
6389  *
6390  * An advisory code will be returned if adv is not null. This
6391  * can be used to provide feedback about interface queues to the
6392  * application.
6393  */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
	struct if_proto *proto = NULL;
	mbuf_t  m = NULL;
	mbuf_t  send_head = NULL;
	mbuf_t  *send_tail = &send_head;
	int iorefcnt = 0;
	u_int32_t pre = 0, post = 0;
	u_int32_t fpkts = 0, fbytes = 0;
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;
	u_int32_t m_loop_set = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	/*
	 * For non-raw output, look up the attached protocol once so that
	 * its pre-output and framing hooks can be used for the whole chain.
	 */
	if (raw == 0) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	if (packetlist == NULL) {
		goto cleanup;
	}

	/* Detach the head packet from the chain for individual processing */
	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = (const struct sockaddr *)&dest6;

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retrieve the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed to by packetlist) is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (raw == 0) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	/*
	 * Per-packet loop: translate (CLAT), frame, run interface filters,
	 * then either batch onto send_head or hand directly to the driver.
	 */
	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (!raw) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (raw == 0 && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp.  This will allow
			 * us to determine that it is a legitimate packet
			 * for the system.  Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context.  If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		retval = dlil_interface_filters_output(ifp, &m, proto_family);
		if (retval != 0) {
			if (retval != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary.  If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Record timestamp; ifnet_enqueue() will use this info
		 * rather than redoing the work.  An optimization could
		 * involve doing this just once at the top, if there are
		 * no interface filters attached, but that's probably
		 * not a big deal.
		 */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		(void) mbuf_set_timestamp(m, now_nsec, TRUE);

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/*
			 * Driver accepts packet lists (or multi-enqueue);
			 * accumulate onto send_head and transmit the whole
			 * batch after the loop.
			 */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			/* Single-packet driver: transmit immediately */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* Surface flow-control state via the advisory */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		m = packetlist;
		if (m != NULL) {
			/* Restore M_LOOP cleared by the framer, if any */
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	/* Flush the batched packets (SENDLIST / ENQUEUE_MULTI drivers) */
	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/*
	 * Common exit: fold byte/packet counters into the ifnet, release
	 * the proto refcnt, free any unprocessed packets, and drop the
	 * datamov io reference and CLAT route reference if taken.
	 */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
6898 
6899 /*
6900  * This routine checks if the destination address is not a loopback, link-local,
6901  * multicast or broadcast address.
6902  */
6903 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)6904 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
6905 {
6906 	int ret = 0;
6907 	switch (proto_family) {
6908 	case PF_INET: {
6909 		struct ip *iph = mtod(m, struct ip *);
6910 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
6911 			ret = 1;
6912 		}
6913 		break;
6914 	}
6915 	case PF_INET6: {
6916 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
6917 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
6918 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
6919 			ret = 1;
6920 		}
6921 		break;
6922 	}
6923 	}
6924 
6925 	return ret;
6926 }
6927 /*
6928  * @brief This routine translates IPv4 packet to IPv6 packet,
6929  *     updates protocol checksum and also translates ICMP for code
6930  *     along with inner header translation.
6931  *
6932  * @param ifp Pointer to the interface
6933  * @param proto_family pointer to protocol family. It is updated if function
6934  *     performs the translation successfully.
6935  * @param m Pointer to the pointer pointing to the packet. Needed because this
6936  *     routine can end up changing the mbuf to a different one.
6937  *
6938  * @return 0 on success or else a negative value.
6939  */
6940 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)6941 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
6942 {
6943 	VERIFY(*proto_family == PF_INET);
6944 	VERIFY(IS_INTF_CLAT46(ifp));
6945 
6946 	pbuf_t pbuf_store, *pbuf = NULL;
6947 	struct ip *iph = NULL;
6948 	struct in_addr osrc, odst;
6949 	uint8_t proto = 0;
6950 	struct in6_ifaddr *ia6_clat_src = NULL;
6951 	struct in6_addr *src = NULL;
6952 	struct in6_addr dst;
6953 	int error = 0;
6954 	uint16_t off = 0;
6955 	uint16_t tot_len = 0;
6956 	uint16_t ip_id_val = 0;
6957 	uint16_t ip_frag_off = 0;
6958 
6959 	boolean_t is_frag = FALSE;
6960 	boolean_t is_first_frag = TRUE;
6961 	boolean_t is_last_frag = TRUE;
6962 
6963 	pbuf_init_mbuf(&pbuf_store, *m, ifp);
6964 	pbuf = &pbuf_store;
6965 	iph = pbuf->pb_data;
6966 
6967 	osrc = iph->ip_src;
6968 	odst = iph->ip_dst;
6969 	proto = iph->ip_p;
6970 	off = (uint16_t)(iph->ip_hl << 2);
6971 	ip_id_val = iph->ip_id;
6972 	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
6973 
6974 	tot_len = ntohs(iph->ip_len);
6975 
6976 	/*
6977 	 * For packets that are not first frags
6978 	 * we only need to adjust CSUM.
6979 	 * For 4 to 6, Fragmentation header gets appended
6980 	 * after proto translation.
6981 	 */
6982 	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
6983 		is_frag = TRUE;
6984 
6985 		/* If the offset is not zero, it is not first frag */
6986 		if (ip_frag_off != 0) {
6987 			is_first_frag = FALSE;
6988 		}
6989 
6990 		/* If IP_MF is set, then it is not last frag */
6991 		if (ntohs(iph->ip_off) & IP_MF) {
6992 			is_last_frag = FALSE;
6993 		}
6994 	}
6995 
6996 	/*
6997 	 * Retrive the local IPv6 CLAT46 address reserved for stateless
6998 	 * translation.
6999 	 */
7000 	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7001 	if (ia6_clat_src == NULL) {
7002 		ip6stat.ip6s_clat464_out_nov6addr_drop++;
7003 		error = -1;
7004 		goto cleanup;
7005 	}
7006 
7007 	src = &ia6_clat_src->ia_addr.sin6_addr;
7008 
7009 	/*
7010 	 * Translate IPv4 destination to IPv6 destination by using the
7011 	 * prefixes learned through prior PLAT discovery.
7012 	 */
7013 	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
7014 		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7015 		goto cleanup;
7016 	}
7017 
7018 	/* Translate the IP header part first */
7019 	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7020 	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
7021 
7022 	iph = NULL;     /* Invalidate iph as pbuf has been modified */
7023 
7024 	if (error != 0) {
7025 		ip6stat.ip6s_clat464_out_46transfail_drop++;
7026 		goto cleanup;
7027 	}
7028 
7029 	/*
7030 	 * Translate protocol header, update checksum, checksum flags
7031 	 * and related fields.
7032 	 */
7033 	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7034 	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7035 
7036 	if (error != 0) {
7037 		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7038 		goto cleanup;
7039 	}
7040 
7041 	/* Now insert the IPv6 fragment header */
7042 	if (is_frag) {
7043 		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7044 
7045 		if (error != 0) {
7046 			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7047 			goto cleanup;
7048 		}
7049 	}
7050 
7051 cleanup:
7052 	if (ia6_clat_src != NULL) {
7053 		IFA_REMREF(&ia6_clat_src->ia_ifa);
7054 	}
7055 
7056 	if (pbuf_is_valid(pbuf)) {
7057 		*m = pbuf->pb_mbuf;
7058 		pbuf->pb_mbuf = NULL;
7059 		pbuf_destroy(pbuf);
7060 	} else {
7061 		error = -1;
7062 		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7063 	}
7064 
7065 	if (error == 0) {
7066 		*proto_family = PF_INET6;
7067 		ip6stat.ip6s_clat464_out_success++;
7068 	}
7069 
7070 	return error;
7071 }
7072 
7073 /*
7074  * @brief This routine translates incoming IPv6 to IPv4 packet,
7075  *     updates protocol checksum and also translates ICMPv6 outer
7076  *     and inner headers
7077  *
7078  * @return 0 on success or else a negative value.
7079  */
7080 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7081 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7082 {
7083 	VERIFY(*proto_family == PF_INET6);
7084 	VERIFY(IS_INTF_CLAT46(ifp));
7085 
7086 	struct ip6_hdr *ip6h = NULL;
7087 	struct in6_addr osrc, odst;
7088 	uint8_t proto = 0;
7089 	struct in6_ifaddr *ia6_clat_dst = NULL;
7090 	struct in_ifaddr *ia4_clat_dst = NULL;
7091 	struct in_addr *dst = NULL;
7092 	struct in_addr src;
7093 	int error = 0;
7094 	uint32_t off = 0;
7095 	u_int64_t tot_len = 0;
7096 	uint8_t tos = 0;
7097 	boolean_t is_first_frag = TRUE;
7098 
7099 	/* Incoming mbuf does not contain valid IP6 header */
7100 	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7101 	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7102 	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7103 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7104 		return -1;
7105 	}
7106 
7107 	ip6h = mtod(*m, struct ip6_hdr *);
7108 	/* Validate that mbuf contains IP payload equal to ip6_plen  */
7109 	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7110 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7111 		return -1;
7112 	}
7113 
7114 	osrc = ip6h->ip6_src;
7115 	odst = ip6h->ip6_dst;
7116 
7117 	/*
7118 	 * Retrieve the local CLAT46 reserved IPv6 address.
7119 	 * Let the packet pass if we don't find one, as the flag
7120 	 * may get set before IPv6 configuration has taken place.
7121 	 */
7122 	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7123 	if (ia6_clat_dst == NULL) {
7124 		goto done;
7125 	}
7126 
7127 	/*
7128 	 * Check if the original dest in the packet is same as the reserved
7129 	 * CLAT46 IPv6 address
7130 	 */
7131 	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7132 		pbuf_t pbuf_store, *pbuf = NULL;
7133 		pbuf_init_mbuf(&pbuf_store, *m, ifp);
7134 		pbuf = &pbuf_store;
7135 
7136 		/*
7137 		 * Retrive the local CLAT46 IPv4 address reserved for stateless
7138 		 * translation.
7139 		 */
7140 		ia4_clat_dst = inifa_ifpclatv4(ifp);
7141 		if (ia4_clat_dst == NULL) {
7142 			IFA_REMREF(&ia6_clat_dst->ia_ifa);
7143 			ip6stat.ip6s_clat464_in_nov4addr_drop++;
7144 			error = -1;
7145 			goto cleanup;
7146 		}
7147 		IFA_REMREF(&ia6_clat_dst->ia_ifa);
7148 
7149 		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7150 		dst = &ia4_clat_dst->ia_addr.sin_addr;
7151 		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7152 			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7153 			error = -1;
7154 			goto cleanup;
7155 		}
7156 
7157 		ip6h = pbuf->pb_data;
7158 		off = sizeof(struct ip6_hdr);
7159 		proto = ip6h->ip6_nxt;
7160 		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7161 		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7162 
7163 		/*
7164 		 * Translate the IP header and update the fragmentation
7165 		 * header if needed
7166 		 */
7167 		error = (nat464_translate_64(pbuf, off, tos, &proto,
7168 		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7169 		    0 : -1;
7170 
7171 		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7172 
7173 		if (error != 0) {
7174 			ip6stat.ip6s_clat464_in_64transfail_drop++;
7175 			goto cleanup;
7176 		}
7177 
7178 		/*
7179 		 * Translate protocol header, update checksum, checksum flags
7180 		 * and related fields.
7181 		 */
7182 		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7183 		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7184 		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7185 
7186 		if (error != 0) {
7187 			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7188 			goto cleanup;
7189 		}
7190 
7191 cleanup:
7192 		if (ia4_clat_dst != NULL) {
7193 			IFA_REMREF(&ia4_clat_dst->ia_ifa);
7194 		}
7195 
7196 		if (pbuf_is_valid(pbuf)) {
7197 			*m = pbuf->pb_mbuf;
7198 			pbuf->pb_mbuf = NULL;
7199 			pbuf_destroy(pbuf);
7200 		} else {
7201 			error = -1;
7202 			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7203 		}
7204 
7205 		if (error == 0) {
7206 			*proto_family = PF_INET;
7207 			ip6stat.ip6s_clat464_in_success++;
7208 		}
7209 	} /* CLAT traffic */
7210 
7211 done:
7212 	return error;
7213 }
7214 
7215 /* The following is used to enqueue work items for ifnet ioctl events */
7216 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7217 
/* Arguments for a deferred ifnet ioctl request (see ifnet_ioctl_async) */
struct ifnet_ioctl_event {
	struct ifnet *ifp;	/* interface, holds an io ref until the callback runs */
	u_long ioctl_code;	/* ioctl command to issue */
};
7222 
/* Work-queue entry wrapping an ifnet_ioctl_event for the network work queue */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;	/* must allow __container_of() recovery */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;	/* deferred ioctl arguments */
};
7227 
7228 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7229 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7230 {
7231 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7232 
7233 	/*
7234 	 * Get an io ref count if the interface is attached.
7235 	 * At this point it most likely is. We are taking a reference for
7236 	 * deferred processing.
7237 	 */
7238 	if (!ifnet_is_attached(ifp, 1)) {
7239 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7240 		    "is not attached",
7241 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7242 		return;
7243 	}
7244 
7245 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7246 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7247 
7248 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7249 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7250 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7251 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7252 }
7253 
7254 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7255 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7256 {
7257 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7258 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7259 
7260 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7261 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7262 	int ret = 0;
7263 
7264 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7265 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7266 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7267 	} else if (dlil_verbose) {
7268 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7269 		    "for ioctl %lu",
7270 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7271 	}
7272 	ifnet_decr_iorefcnt(ifp);
7273 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7274 	return;
7275 }
7276 
/*
 * Dispatch an ioctl against an interface, in three stages:
 * interface filters first, then the attached protocol (if proto_fam
 * is non-zero), then the driver's own if_ioctl.  The first stage to
 * return a definitive answer (anything other than 0/EOPNOTSUPP, or
 * EJUSTRETURN) wins; EJUSTRETURN is mapped to 0 for the caller.
 *
 * Returns EINVAL on bad arguments, EOPNOTSUPP if the interface is not
 * fully attached, otherwise the winning handler's result.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock around the filter callout; the busy
			 * monitor above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/*
	 * Only update retval if no one has handled the ioctl.
	 * NOTE(review): if if_ioctl is NULL and nothing handled the
	 * ioctl, 'result' is still 0 here and this returns success;
	 * presumably all drivers set if_ioctl — confirm.
	 */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing" — report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	/* release the io ref taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7394 
7395 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7396 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7397 {
7398 	errno_t error = 0;
7399 
7400 
7401 	if (ifp->if_set_bpf_tap) {
7402 		/* Get an io reference on the interface if it is attached */
7403 		if (!ifnet_is_attached(ifp, 1)) {
7404 			return ENXIO;
7405 		}
7406 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7407 		ifnet_decr_iorefcnt(ifp);
7408 	}
7409 	return error;
7410 }
7411 
7412 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7413 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7414     struct sockaddr *ll_addr, size_t ll_len)
7415 {
7416 	errno_t result = EOPNOTSUPP;
7417 	struct if_proto *proto;
7418 	const struct sockaddr *verify;
7419 	proto_media_resolve_multi resolvep;
7420 
7421 	if (!ifnet_is_attached(ifp, 1)) {
7422 		return result;
7423 	}
7424 
7425 	bzero(ll_addr, ll_len);
7426 
7427 	/* Call the protocol first; callee holds a proto refcnt upon success */
7428 	ifnet_lock_shared(ifp);
7429 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7430 	ifnet_lock_done(ifp);
7431 	if (proto != NULL) {
7432 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7433 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7434 		if (resolvep != NULL) {
7435 			result = resolvep(ifp, proto_addr,
7436 			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7437 		}
7438 		if_proto_free(proto);
7439 	}
7440 
7441 	/* Let the interface verify the multicast address */
7442 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7443 		if (result == 0) {
7444 			verify = ll_addr;
7445 		} else {
7446 			verify = proto_addr;
7447 		}
7448 		result = ifp->if_check_multi(ifp, verify);
7449 	}
7450 
7451 	ifnet_decr_iorefcnt(ifp);
7452 	return result;
7453 }
7454 
7455 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7456 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7457     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7458     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7459 {
7460 	struct if_proto *proto;
7461 	errno_t result = 0;
7462 
7463 	/* callee holds a proto refcnt upon success */
7464 	ifnet_lock_shared(ifp);
7465 	proto = find_attached_proto(ifp, target_proto->sa_family);
7466 	ifnet_lock_done(ifp);
7467 	if (proto == NULL) {
7468 		result = ENOTSUP;
7469 	} else {
7470 		proto_media_send_arp    arpp;
7471 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7472 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7473 		if (arpp == NULL) {
7474 			result = ENOTSUP;
7475 		} else {
7476 			switch (arpop) {
7477 			case ARPOP_REQUEST:
7478 				arpstat.txrequests++;
7479 				if (target_hw != NULL) {
7480 					arpstat.txurequests++;
7481 				}
7482 				break;
7483 			case ARPOP_REPLY:
7484 				arpstat.txreplies++;
7485 				break;
7486 			}
7487 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7488 			    target_hw, target_proto);
7489 		}
7490 		if_proto_free(proto);
7491 	}
7492 
7493 	return result;
7494 }
7495 
/* Opaque cookie type for the per-thread mark push/pop API below. */
struct net_thread_marks { };
/*
 * Shared zero-size base object; a cookie encodes the bitmask of marks
 * it is responsible for as a byte offset from this base address.
 */
static const struct net_thread_marks net_thread_marks_base = { };

/* Cookie meaning "no marks were newly set": offset 0 from the base. */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7501 
/*
 * Set the requested mark bits on the current uthread and return a
 * cookie recording which bits were NEWLY set, so the matching
 * net_thread_marks_pop() clears exactly those and no others.  The
 * cookie is the bitmask encoded as a pointer offset from a static
 * base address — no allocation is involved.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only bits not already set must be popped later */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	return (net_thread_marks_t)&base[pop];
}
7519 
/*
 * Inverse of net_thread_marks_push(): clear the requested mark bits on
 * the current uthread and return a cookie recording which bits were
 * actually cleared, so net_thread_unmarks_pop() can restore exactly
 * those.  Same offset-from-base cookie encoding as above.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only bits currently set can be (and need to be) cleared */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	return (net_thread_marks_t)&base[unpop];
}
7537 
/*
 * Undo a net_thread_marks_push(): decode the bitmask from the cookie's
 * offset against the static base and clear those bits on the current
 * uthread.  VERIFYs that the offset fits in 32 bits and that every bit
 * being popped is currently set (i.e. push/pop calls are balanced).
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7553 
/*
 * Undo a net_thread_unmarks_push(): decode the bitmask from the
 * cookie's offset and set those bits again on the current uthread.
 * VERIFYs that none of the bits being restored is currently set.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7569 
7570 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7571 net_thread_is_marked(u_int32_t check)
7572 {
7573 	if (check != 0) {
7574 		struct uthread *uth = current_uthread();
7575 		return uth->uu_network_marks & check;
7576 	} else {
7577 		return 0;
7578 	}
7579 }
7580 
7581 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7582 net_thread_is_unmarked(u_int32_t check)
7583 {
7584 	if (check != 0) {
7585 		struct uthread *uth = current_uthread();
7586 		return ~uth->uu_network_marks & check;
7587 	} else {
7588 		return 0;
7589 	}
7590 }
7591 
7592 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7593 _is_announcement(const struct sockaddr_in * sender_sin,
7594     const struct sockaddr_in * target_sin)
7595 {
7596 	if (target_sin == NULL || sender_sin == NULL) {
7597 		return FALSE;
7598 	}
7599 
7600 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7601 }
7602 
/*
 * Send an ARP packet.  Normally forwards straight to
 * dlil_send_arp_internal() on the given interface; the special case is
 * an ARP request for an IPv4 link-local target (and not an
 * announcement), which is broadcast on every attached interface marked
 * IFEF_ARPLL, each using its own source hardware/IP addresses.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 * NOTE(review): the bcopy assumes target_proto is a
	 * sockaddr_in — presumably RTF_ROUTER is only set on AF_INET
	 * routes here; confirm against callers.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		/* ENOTSUP until at least one interface accepts the send */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* hold the lladdr across the unlocked send */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first real status; ENOTSUP only
				 * if every interface refused */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7717 
7718 /*
7719  * Caller must hold ifnet head lock.
7720  */
7721 static int
ifnet_lookup(struct ifnet * ifp)7722 ifnet_lookup(struct ifnet *ifp)
7723 {
7724 	struct ifnet *_ifp;
7725 
7726 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7727 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7728 		if (_ifp == ifp) {
7729 			break;
7730 		}
7731 	}
7732 	return _ifp != NULL;
7733 }
7734 
7735 /*
7736  * Caller has to pass a non-zero refio argument to get a
7737  * IO reference count. This will prevent ifnet_detach from
7738  * being called when there are outstanding io reference counts.
7739  */
7740 int
ifnet_is_attached(struct ifnet * ifp,int refio)7741 ifnet_is_attached(struct ifnet *ifp, int refio)
7742 {
7743 	int ret;
7744 
7745 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7746 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7747 		if (refio > 0) {
7748 			ifp->if_refio++;
7749 		}
7750 	}
7751 	lck_mtx_unlock(&ifp->if_ref_lock);
7752 
7753 	return ret;
7754 }
7755 
/* Note that one more interface worker thread is still starting up. */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7763 
/*
 * A pending interface worker thread has finished starting; wake any
 * waiter blocked on if_threads_pending once the count reaches zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7775 
7776 /*
7777  * Caller must ensure the interface is attached; the assumption is that
7778  * there is at least an outstanding IO reference count held already.
7779  * Most callers would call ifnet_is_{attached,data_ready}() instead.
7780  */
7781 void
ifnet_incr_iorefcnt(struct ifnet * ifp)7782 ifnet_incr_iorefcnt(struct ifnet *ifp)
7783 {
7784 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7785 	VERIFY(IF_FULLY_ATTACHED(ifp));
7786 	VERIFY(ifp->if_refio > 0);
7787 	ifp->if_refio++;
7788 	lck_mtx_unlock(&ifp->if_ref_lock);
7789 }
7790 
/*
 * Drop one io reference with if_ref_lock already held.  When the last
 * reference goes away on a detaching interface, wake the ifnet_detach
 * thread sleeping on if_refio.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov sections each hold an io ref, so refio bounds datamov */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
7811 
/* Locking wrapper around ifnet_decr_iorefcnt_locked(). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7819 
/*
 * Enter a data-movement section: take an io reference and bump the
 * datamov count, but only if the interface is fully attached AND ready
 * (i.e. not suspended).  Returns FALSE without side effects otherwise;
 * on TRUE the caller must balance with ifnet_datamov_end().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
7834 
/*
 * Leave a data-movement section opened by ifnet_datamov_begin():
 * drop the datamov count (waking any drainers when it hits zero)
 * and release the io reference, all under a single lock hold.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7852 
/*
 * Suspend data movement with if_ref_lock held: take an io reference
 * and, on the first suspension, clear IFRF_READY so new
 * ifnet_datamov_begin() calls fail.  Balanced by ifnet_datamov_resume().
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
7863 
/* Locking wrapper around ifnet_datamov_suspend_locked(). */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7872 
/*
 * Suspend data movement only if nobody has suspended it yet.
 * Returns TRUE if this call performed the suspension (the caller
 * then owns a matching ifnet_datamov_resume()), FALSE if it was
 * already suspended and nothing was changed.
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
7886 
/*
 * Block until every in-flight data-movement section has ended, then
 * flush the interface send queues.  Data movement must already have
 * been suspended (if_suspend > 0) so no new sections can begin while
 * we sleep on if_datamov.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
7914 
/* Convenience: suspend data movement, then wait for it to quiesce. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
7921 
/*
 * Balance a prior suspension: when the last suspender resumes,
 * restore IFRF_READY so data movement can begin again, and release
 * the io reference taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7935 
7936 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)7937 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
7938 {
7939 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
7940 	ctrace_t *tr;
7941 	u_int32_t idx;
7942 	u_int16_t *cnt;
7943 
7944 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
7945 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
7946 		/* NOTREACHED */
7947 	}
7948 
7949 	if (refhold) {
7950 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
7951 		tr = dl_if_dbg->dldbg_if_refhold;
7952 	} else {
7953 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
7954 		tr = dl_if_dbg->dldbg_if_refrele;
7955 	}
7956 
7957 	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
7958 	ctrace_record(&tr[idx]);
7959 }
7960 
7961 errno_t
dlil_if_ref(struct ifnet * ifp)7962 dlil_if_ref(struct ifnet *ifp)
7963 {
7964 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
7965 
7966 	if (dl_if == NULL) {
7967 		return EINVAL;
7968 	}
7969 
7970 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
7971 	++dl_if->dl_if_refcnt;
7972 	if (dl_if->dl_if_refcnt == 0) {
7973 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
7974 		/* NOTREACHED */
7975 	}
7976 	if (dl_if->dl_if_trace != NULL) {
7977 		(*dl_if->dl_if_trace)(dl_if, TRUE);
7978 	}
7979 	lck_mtx_unlock(&dl_if->dl_if_lock);
7980 
7981 	return 0;
7982 }
7983 
7984 errno_t
dlil_if_free(struct ifnet * ifp)7985 dlil_if_free(struct ifnet *ifp)
7986 {
7987 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
7988 	bool need_release = FALSE;
7989 
7990 	if (dl_if == NULL) {
7991 		return EINVAL;
7992 	}
7993 
7994 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
7995 	switch (dl_if->dl_if_refcnt) {
7996 	case 0:
7997 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
7998 		/* NOTREACHED */
7999 		break;
8000 	case 1:
8001 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8002 			need_release = TRUE;
8003 		}
8004 		break;
8005 	default:
8006 		break;
8007 	}
8008 	--dl_if->dl_if_refcnt;
8009 	if (dl_if->dl_if_trace != NULL) {
8010 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8011 	}
8012 	lck_mtx_unlock(&dl_if->dl_if_lock);
8013 	if (need_release) {
8014 		_dlil_if_release(ifp, true);
8015 	}
8016 	return 0;
8017 }
8018 
/*
 * Common back end for ifnet_attach_protocol{,_v2}(): insert a fully
 * initialized if_proto into the interface's protocol hash, let the
 * family module refine the demux descriptors, and post the
 * KEV_DL_PROTO_ATTACHED event.  Returns EEXIST if the family is
 * already attached, ENXIO if the interface is detaching.  On success
 * *proto_count (if non-NULL) receives the number of attached protocols.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* io ref prevents detach while we work; released at ioref_done */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash, at the tail of its bucket's
	 * chain.
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8098 
8099 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8100 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8101 {
8102 	/*
8103 	 * A protocol has been attached, mark the interface up.
8104 	 * This used to be done by configd.KernelEventMonitor, but that
8105 	 * is inherently prone to races (rdar://problem/30810208).
8106 	 */
8107 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8108 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8109 	dlil_post_sifflags_msg(ifp);
8110 #if SKYWALK
8111 	switch (protocol) {
8112 	case AF_INET:
8113 	case AF_INET6:
8114 		/* don't attach the flowswitch unless attaching IP */
8115 		dlil_attach_flowswitch_nexus(ifp);
8116 		break;
8117 	default:
8118 		break;
8119 	}
8120 #endif /* SKYWALK */
8121 }
8122 
/*
 * Attach a v1 protocol to an interface.  Validates arguments, confirms
 * the interface is still on the global list (head lock held shared for
 * the whole operation), copies the caller's v1 callbacks into a fresh
 * if_proto, and defers to dlil_attach_protocol().  On success the
 * interface is also brought up via dlil_handle_proto_attach().
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed: the if_proto was never published, free it */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8184 
/*
 * Attach a v2 protocol to an interface.  Identical flow to
 * ifnet_attach_protocol() except the caller's callbacks are copied
 * into the v2 KPI slots (v2 input takes mbuf chains without a
 * separate frame header pointer).
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed: the if_proto was never published, free it */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8246 
/*
 * Detach a protocol from an interface: notify the family module,
 * unlink the if_proto from the hash, and neutralize its callbacks by
 * pointing them at the inert ifproto_media_* stubs so in-flight
 * callers fail safely with ENXIO instead of calling freed code.
 * The final teardown happens when the last proto reference drops.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* replace the live callbacks with harmless stubs */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8312 
8313 
/*
 * Inert stub installed in place of a detached protocol's v1 input
 * callback (see ifnet_detach_protocol); rejects all packets with ENXIO.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8321 
/*
 * Inert stub installed in place of a detached protocol's v2 input
 * callback (see ifnet_detach_protocol); rejects all packets with ENXIO.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8329 
/*
 * Inert stub installed in place of a detached protocol's pre_output
 * callback (see ifnet_detach_protocol); always fails with ENXIO.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8338 
/*
 * Inert stub installed in place of a detached protocol's event
 * callback (see ifnet_detach_protocol); silently discards the event.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8345 
/*
 * Inert stub installed in place of a detached protocol's ioctl
 * callback (see ifnet_detach_protocol); always fails with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8353 
/*
 * Inert stub installed in place of a detached protocol's multicast
 * resolver callback (see ifnet_detach_protocol); always fails with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8361 
/*
 * Inert stub installed in place of a detached protocol's send_arp
 * callback (see ifnet_detach_protocol); always fails with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8370 
8371 extern int if_next_index(void);
8372 extern int tcp_ecn_outbound;
8373 
8374 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8375 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8376 {
8377 	uint32_t sflags = 0;
8378 	int err;
8379 
8380 	if (if_flowadv) {
8381 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8382 	}
8383 
8384 	if (if_delaybased_queue) {
8385 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8386 	}
8387 
8388 	if (ifp->if_output_sched_model ==
8389 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8390 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8391 	}
8392 	/* Inherit drop limit from the default queue */
8393 	if (ifp->if_snd != ifcq) {
8394 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8395 	}
8396 	/* Initialize transmit queue(s) */
8397 	err = ifclassq_setup(ifcq, ifp, sflags);
8398 	if (err != 0) {
8399 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8400 		    "err=%d", __func__, ifp, err);
8401 		/* NOTREACHED */
8402 	}
8403 }
8404 
/*
 * Attach an ifnet to the system, making it visible in ifnet_head and
 * ifindex2ifnet[] and spinning up its DLIL worker threads.
 *
 * The interface must be in the embryonic state (IFRF_EMBRYONIC) on
 * entry; on success it is marked IFRF_ATTACHED|IFRF_READY.  ll_addr,
 * if non-NULL, supplies the initial (permanent) link-layer address and
 * must match if_addrlen when that is already set.
 *
 * Returns 0 on success; EINVAL on bad arguments or address-length
 * mismatch, EEXIST if already attached, ENODEV if no family module,
 * or ENOBUFS on index/address allocation failure.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* adopt or validate the supplied link-layer address length */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = zalloc_flags(dlif_phash_zone,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* the filter list must be empty and idle at attach time */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* a recycled (DLIF_REUSE) ifnet keeps its multicast list */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	/* assign an interface index if this ifnet doesn't have one yet */
	if (ifp->if_index == 0) {
		int idx = if_next_index();

		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the interface in the global lookup structures */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV: no dedicated thread needed; not fatal */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* slightly boost the starter thread's priority */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* slightly boost the poller thread's priority */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* count suspended link-layer memberships left over from reuse */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
8892 
8893 /*
8894  * Prepare the storage for the first/permanent link address, which must
8895  * must have the same lifetime as the ifnet itself.  Although the link
8896  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
8897  * its location in memory must never change as it may still be referred
8898  * to by some parts of the system afterwards (unfortunate implementation
8899  * artifacts inherited from BSD.)
8900  *
8901  * Caller must hold ifnet lock as writer.
8902  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * The sockaddr_dl layout is: header, interface name (sdl_data),
	 * then the link-layer address.  Compute the mask length (covers
	 * header + name) and the total socket size, rounded up to a
	 * 32-bit multiple and at least sizeof(struct sockaddr_dl).
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* fill in the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* netmask covers the header and name bytes (all ones over the name) */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference to any previously installed link address */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9011 
/*
 * Purge all network-layer (IPv4 and IPv6) addresses from an interface;
 * used during interface detach.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9020 
9021 errno_t
ifnet_detach(ifnet_t ifp)9022 ifnet_detach(ifnet_t ifp)
9023 {
9024 	struct ifnet *delegated_ifp;
9025 	struct nd_ifinfo *ndi = NULL;
9026 
9027 	if (ifp == NULL) {
9028 		return EINVAL;
9029 	}
9030 
9031 	ndi = ND_IFINFO(ifp);
9032 	if (NULL != ndi) {
9033 		ndi->cga_initialized = FALSE;
9034 	}
9035 
9036 	/* Mark the interface down */
9037 	if_down(ifp);
9038 
9039 	/*
9040 	 * IMPORTANT NOTE
9041 	 *
9042 	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
9043 	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
9044 	 * until after we've waited for all I/O references to drain
9045 	 * in ifnet_detach_final().
9046 	 */
9047 
9048 	ifnet_head_lock_exclusive();
9049 	ifnet_lock_exclusive(ifp);
9050 
9051 	if (ifp->if_output_netem != NULL) {
9052 		netem_destroy(ifp->if_output_netem);
9053 		ifp->if_output_netem = NULL;
9054 	}
9055 
9056 	/*
9057 	 * Check to see if this interface has previously triggered
9058 	 * aggressive protocol draining; if so, decrement the global
9059 	 * refcnt and clear PR_AGGDRAIN on the route domain if
9060 	 * there are no more of such an interface around.
9061 	 */
9062 	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
9063 
9064 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9065 	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
9066 		lck_mtx_unlock(&ifp->if_ref_lock);
9067 		ifnet_lock_done(ifp);
9068 		ifnet_head_done();
9069 		return EINVAL;
9070 	} else if (ifp->if_refflags & IFRF_DETACHING) {
9071 		/* Interface has already been detached */
9072 		lck_mtx_unlock(&ifp->if_ref_lock);
9073 		ifnet_lock_done(ifp);
9074 		ifnet_head_done();
9075 		return ENXIO;
9076 	}
9077 	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
9078 	/* Indicate this interface is being detached */
9079 	ifp->if_refflags &= ~IFRF_ATTACHED;
9080 	ifp->if_refflags |= IFRF_DETACHING;
9081 	lck_mtx_unlock(&ifp->if_ref_lock);
9082 
9083 	if (dlil_verbose) {
9084 		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
9085 	}
9086 
9087 	/* clean up flow control entry object if there's any */
9088 	if (ifp->if_eflags & IFEF_TXSTART) {
9089 		ifnet_flowadv(ifp->if_flowhash);
9090 	}
9091 
9092 	/* Reset ECN enable/disable flags */
9093 	/* Reset CLAT46 flag */
9094 	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);
9095 
9096 	/*
9097 	 * We do not reset the TCP keep alive counters in case
9098 	 * a TCP connection stays connection after the interface
9099 	 * went down
9100 	 */
9101 	if (ifp->if_tcp_kao_cnt > 0) {
9102 		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
9103 		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
9104 	}
9105 	ifp->if_tcp_kao_max = 0;
9106 
9107 	/*
9108 	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
9109 	 * no longer be visible during lookups from this point.
9110 	 */
9111 	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
9112 	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
9113 	ifp->if_link.tqe_next = NULL;
9114 	ifp->if_link.tqe_prev = NULL;
9115 	if (ifp->if_ordered_link.tqe_next != NULL ||
9116 	    ifp->if_ordered_link.tqe_prev != NULL) {
9117 		ifnet_remove_from_ordered_list(ifp);
9118 	}
9119 	ifindex2ifnet[ifp->if_index] = NULL;
9120 
9121 	/* 18717626 - reset router mode */
9122 	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
9123 	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;
9124 
9125 	/* Record detach PC stacktrace */
9126 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
9127 
9128 	/* Clear logging parameters */
9129 	bzero(&ifp->if_log, sizeof(ifp->if_log));
9130 
9131 	/* Clear delegated interface info (reference released below) */
9132 	delegated_ifp = ifp->if_delegated.ifp;
9133 	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));
9134 
9135 	/* Reset interface state */
9136 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9137 
9138 	ifnet_lock_done(ifp);
9139 	ifnet_head_done();
9140 
9141 	/* Release reference held on the delegated interface */
9142 	if (delegated_ifp != NULL) {
9143 		ifnet_release(delegated_ifp);
9144 	}
9145 
9146 	/* Reset Link Quality Metric (unless loopback [lo0]) */
9147 	if (ifp != lo_ifp) {
9148 		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
9149 	}
9150 
9151 	/* Reset TCP local statistics */
9152 	if (ifp->if_tcp_stat != NULL) {
9153 		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
9154 	}
9155 
9156 	/* Reset UDP local statistics */
9157 	if (ifp->if_udp_stat != NULL) {
9158 		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
9159 	}
9160 
9161 	/* Reset ifnet IPv4 stats */
9162 	if (ifp->if_ipv4_stat != NULL) {
9163 		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
9164 	}
9165 
9166 	/* Reset ifnet IPv6 stats */
9167 	if (ifp->if_ipv6_stat != NULL) {
9168 		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
9169 	}
9170 
9171 	/* Release memory held for interface link status report */
9172 	if (ifp->if_link_status != NULL) {
9173 		kfree_type(struct if_link_status, ifp->if_link_status);
9174 		ifp->if_link_status = NULL;
9175 	}
9176 
9177 	/* Let BPF know we're detaching */
9178 	bpfdetach(ifp);
9179 
9180 	/* Disable forwarding cached route */
9181 	lck_mtx_lock(&ifp->if_cached_route_lock);
9182 	ifp->if_fwd_cacheok = 0;
9183 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9184 
9185 	/* Disable data threshold and wait for any pending event posting */
9186 	ifp->if_data_threshold = 0;
9187 	VERIFY(ifp->if_dt_tcall != NULL);
9188 	(void) thread_call_cancel_wait(ifp->if_dt_tcall);
9189 
9190 	/*
9191 	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
9192 	 * references to the info structures and leave them attached to
9193 	 * this ifnet.
9194 	 */
9195 #if INET
9196 	igmp_domifdetach(ifp);
9197 #endif /* INET */
9198 	mld_domifdetach(ifp);
9199 
9200 #if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
9202 	netns_ifnet_detach(ifp);
9203 #endif /* SKYWALK */
9204 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);
9205 
9206 	/* Let worker thread take care of the rest, to avoid reentrancy */
9207 	dlil_if_lock();
9208 	ifnet_detaching_enqueue(ifp);
9209 	dlil_if_unlock();
9210 
9211 	return 0;
9212 }
9213 
/*
 * Append @ifp to the global list of interfaces awaiting final detach
 * and wake the detacher thread to process it.  Caller must hold the
 * dlil interface lock (asserted below).
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* rouse the detacher thread sleeping on ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9224 
9225 static struct ifnet *
ifnet_detaching_dequeue(void)9226 ifnet_detaching_dequeue(void)
9227 {
9228 	struct ifnet *ifp;
9229 
9230 	dlil_if_lock_assert();
9231 
9232 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9233 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9234 	if (ifp != NULL) {
9235 		VERIFY(ifnet_detaching_cnt != 0);
9236 		--ifnet_detaching_cnt;
9237 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9238 		ifp->if_detaching_link.tqe_next = NULL;
9239 		ifp->if_detaching_link.tqe_prev = NULL;
9240 	}
9241 	return ifp;
9242 }
9243 
/*
 * Continuation routine for the interface detacher thread.  Drains the
 * global detaching queue, invoking ifnet_detach_final() on each ifnet
 * with the dlil lock dropped across the call, then re-arms the wait on
 * ifnet_delayed_run with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; final detach may block/re-enter dlil */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9286 
/*
 * Entry point for the interface detacher kernel thread.  Arms the wait
 * on ifnet_delayed_run, marks the thread embryonic (cleared on the
 * first pass through the continuation), and hands control over to
 * ifnet_detacher_thread_cont(), which never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9303 
9304 static void
ifnet_detach_final(struct ifnet * ifp)9305 ifnet_detach_final(struct ifnet *ifp)
9306 {
9307 	struct ifnet_filter *filter, *filter_next;
9308 	struct dlil_ifnet *dlifp;
9309 	struct ifnet_filter_head fhead;
9310 	struct dlil_threading_info *inp;
9311 	struct ifaddr *ifa;
9312 	ifnet_detached_func if_free;
9313 	int i;
9314 
9315 #if SKYWALK
9316 	/*
9317 	 * Wait for the datapath to quiesce before tearing down
9318 	 * netif/flowswitch nexuses.
9319 	 */
9320 	dlil_quiesce_and_detach_nexuses(ifp);
9321 #endif /* SKYWALK */
9322 
9323 	lck_mtx_lock(&ifp->if_ref_lock);
9324 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9325 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9326 		    __func__, ifp);
9327 		/* NOTREACHED */
9328 	}
9329 
9330 	/*
9331 	 * Wait until the existing IO references get released
9332 	 * before we proceed with ifnet_detach.  This is not a
9333 	 * common case, so block without using a continuation.
9334 	 */
9335 	while (ifp->if_refio > 0) {
9336 		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9337 		    "to be released\n", __func__, if_name(ifp));
9338 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9339 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9340 	}
9341 
9342 	VERIFY(ifp->if_datamov == 0);
9343 	VERIFY(ifp->if_drainers == 0);
9344 	VERIFY(ifp->if_suspend == 0);
9345 	ifp->if_refflags &= ~IFRF_READY;
9346 	lck_mtx_unlock(&ifp->if_ref_lock);
9347 
9348 	/* Clear agent IDs */
9349 	if (ifp->if_agentids != NULL) {
9350 		kfree_data(ifp->if_agentids,
9351 		    sizeof(uuid_t) * ifp->if_agentcount);
9352 		ifp->if_agentids = NULL;
9353 	}
9354 	ifp->if_agentcount = 0;
9355 
9356 #if SKYWALK
9357 	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9358 #endif /* SKYWALK */
9359 	/* Drain and destroy send queue */
9360 	ifclassq_teardown(ifp->if_snd);
9361 
9362 	/* Detach interface filters */
9363 	lck_mtx_lock(&ifp->if_flt_lock);
9364 	if_flt_monitor_enter(ifp);
9365 
9366 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9367 	fhead = ifp->if_flt_head;
9368 	TAILQ_INIT(&ifp->if_flt_head);
9369 
9370 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9371 		filter_next = TAILQ_NEXT(filter, filt_next);
9372 		lck_mtx_unlock(&ifp->if_flt_lock);
9373 
9374 		dlil_detach_filter_internal(filter, 1);
9375 		lck_mtx_lock(&ifp->if_flt_lock);
9376 	}
9377 	if_flt_monitor_leave(ifp);
9378 	lck_mtx_unlock(&ifp->if_flt_lock);
9379 
9380 	/* Tell upper layers to drop their network addresses */
9381 	if_purgeaddrs(ifp);
9382 
9383 	ifnet_lock_exclusive(ifp);
9384 
9385 	/* Unplumb all protocols */
9386 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9387 		struct if_proto *proto;
9388 
9389 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9390 		while (proto != NULL) {
9391 			protocol_family_t family = proto->protocol_family;
9392 			ifnet_lock_done(ifp);
9393 			proto_unplumb(family, ifp);
9394 			ifnet_lock_exclusive(ifp);
9395 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9396 		}
9397 		/* There should not be any protocols left */
9398 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9399 	}
9400 	zfree(dlif_phash_zone, ifp->if_proto_hash);
9401 	ifp->if_proto_hash = NULL;
9402 
9403 	/* Detach (permanent) link address from if_addrhead */
9404 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9405 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9406 	IFA_LOCK(ifa);
9407 	if_detach_link_ifa(ifp, ifa);
9408 	IFA_UNLOCK(ifa);
9409 
9410 	/* Remove (permanent) link address from ifnet_addrs[] */
9411 	IFA_REMREF(ifa);
9412 	ifnet_addrs[ifp->if_index - 1] = NULL;
9413 
9414 	/* This interface should not be on {ifnet_head,detaching} */
9415 	VERIFY(ifp->if_link.tqe_next == NULL);
9416 	VERIFY(ifp->if_link.tqe_prev == NULL);
9417 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9418 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9419 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9420 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9421 
9422 	/* The slot should have been emptied */
9423 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9424 
9425 	/* There should not be any addresses left */
9426 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9427 
9428 	/*
9429 	 * Signal the starter thread to terminate itself, and wait until
9430 	 * it has exited.
9431 	 */
9432 	if (ifp->if_start_thread != THREAD_NULL) {
9433 		lck_mtx_lock_spin(&ifp->if_start_lock);
9434 		ifp->if_start_flags |= IFSF_TERMINATING;
9435 		wakeup_one((caddr_t)&ifp->if_start_thread);
9436 		lck_mtx_unlock(&ifp->if_start_lock);
9437 
9438 		/* wait for starter thread to terminate */
9439 		lck_mtx_lock(&ifp->if_start_lock);
9440 		while (ifp->if_start_thread != THREAD_NULL) {
9441 			if (dlil_verbose) {
9442 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9443 				    __func__,
9444 				    if_name(ifp));
9445 			}
9446 			(void) msleep(&ifp->if_start_thread,
9447 			    &ifp->if_start_lock, (PZERO - 1),
9448 			    "ifnet_start_thread_exit", NULL);
9449 		}
9450 		lck_mtx_unlock(&ifp->if_start_lock);
9451 		if (dlil_verbose) {
9452 			DLIL_PRINTF("%s: %s starter thread termination complete",
9453 			    __func__, if_name(ifp));
9454 		}
9455 	}
9456 
9457 	/*
9458 	 * Signal the poller thread to terminate itself, and wait until
9459 	 * it has exited.
9460 	 */
9461 	if (ifp->if_poll_thread != THREAD_NULL) {
9462 #if SKYWALK
9463 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9464 #endif /* SKYWALK */
9465 		lck_mtx_lock_spin(&ifp->if_poll_lock);
9466 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9467 		wakeup_one((caddr_t)&ifp->if_poll_thread);
9468 		lck_mtx_unlock(&ifp->if_poll_lock);
9469 
9470 		/* wait for poller thread to terminate */
9471 		lck_mtx_lock(&ifp->if_poll_lock);
9472 		while (ifp->if_poll_thread != THREAD_NULL) {
9473 			if (dlil_verbose) {
9474 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9475 				    __func__,
9476 				    if_name(ifp));
9477 			}
9478 			(void) msleep(&ifp->if_poll_thread,
9479 			    &ifp->if_poll_lock, (PZERO - 1),
9480 			    "ifnet_poll_thread_exit", NULL);
9481 		}
9482 		lck_mtx_unlock(&ifp->if_poll_lock);
9483 		if (dlil_verbose) {
9484 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
9485 			    __func__, if_name(ifp));
9486 		}
9487 	}
9488 
9489 	/*
9490 	 * If thread affinity was set for the workloop thread, we will need
9491 	 * to tear down the affinity and release the extra reference count
9492 	 * taken at attach time.  Does not apply to lo0 or other interfaces
9493 	 * without dedicated input threads.
9494 	 */
9495 	if ((inp = ifp->if_inp) != NULL) {
9496 		VERIFY(inp != dlil_main_input_thread);
9497 
9498 		if (inp->dlth_affinity) {
9499 			struct thread *tp, *wtp, *ptp;
9500 
9501 			lck_mtx_lock_spin(&inp->dlth_lock);
9502 			wtp = inp->dlth_driver_thread;
9503 			inp->dlth_driver_thread = THREAD_NULL;
9504 			ptp = inp->dlth_poller_thread;
9505 			inp->dlth_poller_thread = THREAD_NULL;
9506 			ASSERT(inp->dlth_thread != THREAD_NULL);
9507 			tp = inp->dlth_thread;    /* don't nullify now */
9508 			inp->dlth_affinity_tag = 0;
9509 			inp->dlth_affinity = FALSE;
9510 			lck_mtx_unlock(&inp->dlth_lock);
9511 
9512 			/* Tear down poll thread affinity */
9513 			if (ptp != NULL) {
9514 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9515 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
9516 				(void) dlil_affinity_set(ptp,
9517 				    THREAD_AFFINITY_TAG_NULL);
9518 				thread_deallocate(ptp);
9519 			}
9520 
9521 			/* Tear down workloop thread affinity */
9522 			if (wtp != NULL) {
9523 				(void) dlil_affinity_set(wtp,
9524 				    THREAD_AFFINITY_TAG_NULL);
9525 				thread_deallocate(wtp);
9526 			}
9527 
9528 			/* Tear down DLIL input thread affinity */
9529 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9530 			thread_deallocate(tp);
9531 		}
9532 
9533 		/* disassociate ifp DLIL input thread */
9534 		ifp->if_inp = NULL;
9535 
9536 		/* if the worker thread was created, tell it to terminate */
9537 		if (inp->dlth_thread != THREAD_NULL) {
9538 			lck_mtx_lock_spin(&inp->dlth_lock);
9539 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9540 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9541 				wakeup_one((caddr_t)&inp->dlth_flags);
9542 			}
9543 			lck_mtx_unlock(&inp->dlth_lock);
9544 			ifnet_lock_done(ifp);
9545 
9546 			/* wait for the input thread to terminate */
9547 			lck_mtx_lock_spin(&inp->dlth_lock);
9548 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9549 			    == 0) {
9550 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9551 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9552 			}
9553 			lck_mtx_unlock(&inp->dlth_lock);
9554 			ifnet_lock_exclusive(ifp);
9555 		}
9556 
9557 		/* clean-up input thread state */
9558 		dlil_clean_threading_info(inp);
9559 		/* clean-up poll parameters */
9560 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
9561 		dlil_reset_rxpoll_params(ifp);
9562 	}
9563 
9564 	/* The driver might unload, so point these to ourselves */
9565 	if_free = ifp->if_free;
9566 	ifp->if_output_dlil = ifp_if_output;
9567 	ifp->if_output = ifp_if_output;
9568 	ifp->if_pre_enqueue = ifp_if_output;
9569 	ifp->if_start = ifp_if_start;
9570 	ifp->if_output_ctl = ifp_if_ctl;
9571 	ifp->if_input_dlil = ifp_if_input;
9572 	ifp->if_input_poll = ifp_if_input_poll;
9573 	ifp->if_input_ctl = ifp_if_ctl;
9574 	ifp->if_ioctl = ifp_if_ioctl;
9575 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9576 	ifp->if_free = ifp_if_free;
9577 	ifp->if_demux = ifp_if_demux;
9578 	ifp->if_event = ifp_if_event;
9579 	ifp->if_framer_legacy = ifp_if_framer;
9580 	ifp->if_framer = ifp_if_framer_extended;
9581 	ifp->if_add_proto = ifp_if_add_proto;
9582 	ifp->if_del_proto = ifp_if_del_proto;
9583 	ifp->if_check_multi = ifp_if_check_multi;
9584 
9585 	/* wipe out interface description */
9586 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9587 	ifp->if_desc.ifd_len = 0;
9588 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9589 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9590 
9591 	/* there shouldn't be any delegation by now */
9592 	VERIFY(ifp->if_delegated.ifp == NULL);
9593 	VERIFY(ifp->if_delegated.type == 0);
9594 	VERIFY(ifp->if_delegated.family == 0);
9595 	VERIFY(ifp->if_delegated.subfamily == 0);
9596 	VERIFY(ifp->if_delegated.expensive == 0);
9597 	VERIFY(ifp->if_delegated.constrained == 0);
9598 
9599 	/* QoS marking get cleared */
9600 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9601 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9602 
9603 #if SKYWALK
9604 	/* the nexus destructor is responsible for clearing these */
9605 	VERIFY(ifp->if_na_ops == NULL);
9606 	VERIFY(ifp->if_na == NULL);
9607 #endif /* SKYWALK */
9608 
9609 	/* promiscuous count needs to start at zero again */
9610 	ifp->if_pcount = 0;
9611 	ifp->if_flags &= ~IFF_PROMISC;
9612 
9613 	ifnet_lock_done(ifp);
9614 
9615 #if PF
9616 	/*
9617 	 * Detach this interface from packet filter, if enabled.
9618 	 */
9619 	pf_ifnet_hook(ifp, 0);
9620 #endif /* PF */
9621 
9622 	/* Filter list should be empty */
9623 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9624 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9625 	VERIFY(ifp->if_flt_busy == 0);
9626 	VERIFY(ifp->if_flt_waiters == 0);
9627 	VERIFY(ifp->if_flt_non_os_count == 0);
9628 	VERIFY(ifp->if_flt_no_tso_count == 0);
9629 	lck_mtx_unlock(&ifp->if_flt_lock);
9630 
9631 	/* Last chance to drain send queue */
9632 	if_qflush_snd(ifp, 0);
9633 
9634 	/* Last chance to cleanup any cached route */
9635 	lck_mtx_lock(&ifp->if_cached_route_lock);
9636 	VERIFY(!ifp->if_fwd_cacheok);
9637 	ROUTE_RELEASE(&ifp->if_fwd_route);
9638 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9639 	ROUTE_RELEASE(&ifp->if_src_route);
9640 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9641 	ROUTE_RELEASE(&ifp->if_src_route6);
9642 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9643 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9644 
9645 	VERIFY(ifp->if_data_threshold == 0);
9646 	VERIFY(ifp->if_dt_tcall != NULL);
9647 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9648 
9649 	ifnet_llreach_ifdetach(ifp);
9650 
9651 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9652 
9653 	/*
9654 	 * Finally, mark this ifnet as detached.
9655 	 */
9656 	if (dlil_verbose) {
9657 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
9658 	}
9659 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9660 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9661 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9662 		    __func__, ifp);
9663 		/* NOTREACHED */
9664 	}
9665 	ifp->if_refflags &= ~IFRF_DETACHING;
9666 	lck_mtx_unlock(&ifp->if_ref_lock);
9667 	if (if_free != NULL) {
9668 		if_free(ifp);
9669 	}
9670 
9671 	ifclassq_release(&ifp->if_snd);
9672 
9673 	/* we're fully detached, clear the "in use" bit */
9674 	dlifp = (struct dlil_ifnet *)ifp;
9675 	lck_mtx_lock(&dlifp->dl_if_lock);
9676 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9677 	dlifp->dl_if_flags &= ~DLIF_INUSE;
9678 	lck_mtx_unlock(&dlifp->dl_if_lock);
9679 
9680 	/* Release reference held during ifnet attach */
9681 	ifnet_release(ifp);
9682 }
9683 
/*
 * if_output handler installed once an interface has been detached;
 * silently discards any packet chain handed to it.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
9691 
/*
 * if_start handler for a detached interface: purge anything still
 * sitting in the interface send queue.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9697 
/*
 * if_input handler for a detached interface: free the inbound mbuf
 * chain and report the device as no longer present (ENXIO).
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9707 
9708 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9709 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9710     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9711 {
9712 #pragma unused(ifp, flags, max_cnt)
9713 	if (m_head != NULL) {
9714 		*m_head = NULL;
9715 	}
9716 	if (m_tail != NULL) {
9717 		*m_tail = NULL;
9718 	}
9719 	if (cnt != NULL) {
9720 		*cnt = 0;
9721 	}
9722 	if (len != NULL) {
9723 		*len = 0;
9724 	}
9725 }
9726 
/*
 * if_output_ctl / if_input_ctl handler for a detached interface;
 * no control operations are supported.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9733 
/*
 * if_demux handler for a detached interface: free the packet and
 * tell the caller the frame has been consumed (EJUSTRETURN).
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9741 
/*
 * if_add_proto handler for a detached interface; protocol attachment
 * is no longer possible.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9749 
/*
 * if_del_proto handler for a detached interface; there are no
 * protocols left to remove.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9756 
/*
 * if_check_multi handler for a detached interface; multicast address
 * validation is not supported.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
9763 
/*
 * Legacy framer installed on a detached interface; forwards to the
 * extended variant (which drops the packet).  The signature differs
 * between embedded and macOS builds, hence the conditional prototype.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
9782 
9783 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)9784 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
9785     const struct sockaddr *sa, const char *ll, const char *t,
9786     u_int32_t *pre, u_int32_t *post)
9787 {
9788 #pragma unused(ifp, sa, ll, t)
9789 	m_freem(*m);
9790 	*m = NULL;
9791 
9792 	if (pre != NULL) {
9793 		*pre = 0;
9794 	}
9795 	if (post != NULL) {
9796 		*post = 0;
9797 	}
9798 
9799 	return EJUSTRETURN;
9800 }
9801 
/*
 * if_ioctl handler for a detached interface; no ioctls are supported.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
9808 
/*
 * if_set_bpf_tap handler for a detached interface; accepted as a
 * no-op since there is no datapath left to tap.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
9816 
/*
 * if_free handler for a detached interface; intentionally empty —
 * the driver's own callback has already been invoked.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
9822 
/*
 * if_event handler for a detached interface; events are ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
9828 
/*
 * Find or allocate the backing dlil_ifnet storage for a new interface
 * attach.  An existing not-in-use entry with the same family and
 * unique id is recycled (DLIF_REUSE); otherwise fresh storage is
 * allocated.  Returns EBUSY when an in-use entry with the same
 * extended name or unique id exists, ENOMEM on allocation failure,
 * or 0 with *ifp set and referenced via dlil_if_ref().
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* stash a copy of the caller-supplied unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point at storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10006 
/*
 * Common guts of dlil_if_release(): drop the interface allocation
 * accounting, free any oversized broadcast-address storage, restore
 * the name/xname pointers to the embedded storage, and optionally
 * clear the DLIF_INUSE bit so the entry can be recycled.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast addresses larger than the inline buffer were kalloc'd */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10037 
/*
 * Release the dlil_ifnet storage backing @ifp without clearing the
 * in-use bit (the detacher clears that separately).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10043 
/*
 * Acquire the global dlil interface-list mutex.
 */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10049 
/*
 * Release the global dlil interface-list mutex.
 */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10055 
/*
 * Assert that the current thread owns the global dlil interface-list
 * mutex (debug builds).
 */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10061 
/*
 * Unplumb the inet and inet6 protocol attachments from @ifp during
 * interface detach.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10077 
/*
 * Snapshot the interface's cached IPv4 source route into @dst
 * (route_copyout transfers a route reference to the caller).
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	/* take as spin, then convert to full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10088 
/*
 * Store @src back as the interface's cached IPv4 source route
 * (consumes the caller's route reference).  If route caching has
 * been disabled — e.g. the interface is detaching — the route is
 * simply released instead.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	/* take as spin, then convert to full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10102 
/*
 * Snapshot the interface's cached IPv6 source route into @dst
 * (route_copyout transfers a route reference to the caller).
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	/* take as spin, then convert to full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10114 
/*
 * Store @src back as the interface's cached IPv6 source route
 * (consumes the caller's route reference), or release it when
 * route caching is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	/* take as spin, then convert to full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10129 
/*
 * Return a referenced rtentry for @src_ip, using the interface's
 * cached IPv4 source route when it is still valid for that address;
 * otherwise perform a scoped route lookup and cache the new result.
 * Returns NULL when no route exists.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	/* cache miss: stale/unusable route or a different destination */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (re)initialize the destination sockaddr */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10164 
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	/*
	 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): return a
	 * routing entry for `src_ip6' scoped to this interface, reusing
	 * the cached route when valid.  The returned rtentry (if any)
	 * carries a reference for the caller.
	 */
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		/* miss: drop the stale route and rebuild the destination */
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * ro_rt should be NULL after ROUTE_RELEASE() above (the
		 * IPv4 path VERIFYs exactly this) -- the lookup only runs
		 * in that case.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10201 
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	/*
	 * Update the interface's link quality metric (LQM) state and, if
	 * it changed, post a KEV_DL_LINK_QUALITY_METRIC_CHANGED event.
	 *
	 * `locked' != 0 means the caller already holds the ifnet lock
	 * exclusively; the lock is dropped while posting the event and
	 * returned to the caller's original state before returning.
	 */
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/*
		 * Worst bucket: flag the TCP pcbinfo and schedule its
		 * fast timer so connections can be aborted promptly.
		 */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10266 
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	/*
	 * Record a new cellular RRC state and post a
	 * KEV_DL_RRC_STATE_CHANGED event if it changed.
	 *
	 * Must be called with the ifnet lock held exclusively; the lock
	 * is dropped around the event post and reacquired before return
	 * (the caller keeps ownership).
	 */
	struct kev_dl_rrc_state kev;

	/* no change and already marked valid: nothing to do */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10296 
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	/*
	 * Validate and apply externally supplied interface state: link
	 * quality metric, cellular RRC state, and availability.
	 *
	 * Returns ENOTSUP when RRC state is supplied for a non-cellular
	 * interface, EINVAL for out-of-range LQM or RRC values, and 0 on
	 * success.
	 */
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state is only meaningful on cellular interfaces */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	/* LQM must be within [IFNET_LQM_MIN, IFNET_LQM_MAX] */
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	/* RRC state must be either IDLE or CONNECTED */
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/*
	 * Both helpers below are called with the ifnet lock held; each
	 * may drop and reacquire it internally while posting events.
	 */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		/* remember the index if the interface became available */
		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10367 
10368 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10369 if_get_state(struct ifnet *ifp,
10370     struct if_interface_state *if_interface_state)
10371 {
10372 	ifnet_lock_shared(ifp);
10373 
10374 	if_interface_state->valid_bitmask = 0;
10375 
10376 	if (ifp->if_interface_state.valid_bitmask &
10377 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10378 		if_interface_state->valid_bitmask |=
10379 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10380 		if_interface_state->rrc_state =
10381 		    ifp->if_interface_state.rrc_state;
10382 	}
10383 	if (ifp->if_interface_state.valid_bitmask &
10384 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10385 		if_interface_state->valid_bitmask |=
10386 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10387 		if_interface_state->lqm_state =
10388 		    ifp->if_interface_state.lqm_state;
10389 	}
10390 	if (ifp->if_interface_state.valid_bitmask &
10391 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10392 		if_interface_state->valid_bitmask |=
10393 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10394 		if_interface_state->interface_availability =
10395 		    ifp->if_interface_state.interface_availability;
10396 	}
10397 
10398 	ifnet_lock_done(ifp);
10399 }
10400 
10401 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10402 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10403 {
10404 	if (conn_probe > 1) {
10405 		return EINVAL;
10406 	}
10407 	if (conn_probe == 0) {
10408 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10409 	} else {
10410 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10411 	}
10412 
10413 #if NECP
10414 	necp_update_all_clients();
10415 #endif /* NECP */
10416 
10417 	tcp_probe_connectivity(ifp, conn_probe);
10418 	return 0;
10419 }
10420 
10421 /* for uuid.c */
10422 static int
get_ether_index(int * ret_other_index)10423 get_ether_index(int * ret_other_index)
10424 {
10425 	struct ifnet *ifp;
10426 	int en0_index = 0;
10427 	int other_en_index = 0;
10428 	int any_ether_index = 0;
10429 	short best_unit = 0;
10430 
10431 	*ret_other_index = 0;
10432 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10433 		/*
10434 		 * find en0, or if not en0, the lowest unit en*, and if not
10435 		 * that, any ethernet
10436 		 */
10437 		ifnet_lock_shared(ifp);
10438 		if (strcmp(ifp->if_name, "en") == 0) {
10439 			if (ifp->if_unit == 0) {
10440 				/* found en0, we're done */
10441 				en0_index = ifp->if_index;
10442 				ifnet_lock_done(ifp);
10443 				break;
10444 			}
10445 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
10446 				other_en_index = ifp->if_index;
10447 				best_unit = ifp->if_unit;
10448 			}
10449 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10450 			any_ether_index = ifp->if_index;
10451 		}
10452 		ifnet_lock_done(ifp);
10453 	}
10454 	if (en0_index == 0) {
10455 		if (other_en_index != 0) {
10456 			*ret_other_index = other_en_index;
10457 		} else if (any_ether_index != 0) {
10458 			*ret_other_index = any_ether_index;
10459 		}
10460 	}
10461 	return en0_index;
10462 }
10463 
int
uuid_get_ethernet(u_int8_t *node)
{
	/*
	 * Copy ETHER_ADDR_LEN bytes of an ethernet address into `node'
	 * for UUID generation.  Prefers en0 (whose ifindex is cached
	 * across calls in a static), then another "en" unit, then any
	 * ethernet interface.  Returns 0 on success, -1 when no
	 * suitable interface exists.
	 */
	static int en0_index;   /* cached ifindex of en0; 0 if unknown */
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* revalidate the cache: the cached index may have detached */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10505 
10506 static int
10507 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10508 {
10509 #pragma unused(arg1, arg2)
10510 	uint32_t i;
10511 	int err;
10512 
10513 	i = if_rxpoll;
10514 
10515 	err = sysctl_handle_int(oidp, &i, 0, req);
10516 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10517 		return err;
10518 	}
10519 
10520 	if (net_rxpoll == 0) {
10521 		return ENXIO;
10522 	}
10523 
10524 	if_rxpoll = i;
10525 	return err;
10526 }
10527 
10528 static int
10529 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10530 {
10531 #pragma unused(arg1, arg2)
10532 	uint64_t q;
10533 	int err;
10534 
10535 	q = if_rxpoll_mode_holdtime;
10536 
10537 	err = sysctl_handle_quad(oidp, &q, 0, req);
10538 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10539 		return err;
10540 	}
10541 
10542 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10543 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10544 	}
10545 
10546 	if_rxpoll_mode_holdtime = q;
10547 
10548 	return err;
10549 }
10550 
10551 static int
10552 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10553 {
10554 #pragma unused(arg1, arg2)
10555 	uint64_t q;
10556 	int err;
10557 
10558 	q = if_rxpoll_sample_holdtime;
10559 
10560 	err = sysctl_handle_quad(oidp, &q, 0, req);
10561 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10562 		return err;
10563 	}
10564 
10565 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10566 		q = IF_RXPOLL_SAMPLETIME_MIN;
10567 	}
10568 
10569 	if_rxpoll_sample_holdtime = q;
10570 
10571 	return err;
10572 }
10573 
10574 static int
10575 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10576 {
10577 #pragma unused(arg1, arg2)
10578 	uint64_t q;
10579 	int err;
10580 
10581 	q = if_rxpoll_interval_time;
10582 
10583 	err = sysctl_handle_quad(oidp, &q, 0, req);
10584 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10585 		return err;
10586 	}
10587 
10588 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10589 		q = IF_RXPOLL_INTERVALTIME_MIN;
10590 	}
10591 
10592 	if_rxpoll_interval_time = q;
10593 
10594 	return err;
10595 }
10596 
10597 static int
10598 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10599 {
10600 #pragma unused(arg1, arg2)
10601 	uint32_t i;
10602 	int err;
10603 
10604 	i = if_sysctl_rxpoll_wlowat;
10605 
10606 	err = sysctl_handle_int(oidp, &i, 0, req);
10607 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10608 		return err;
10609 	}
10610 
10611 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10612 		return EINVAL;
10613 	}
10614 
10615 	if_sysctl_rxpoll_wlowat = i;
10616 	return err;
10617 }
10618 
10619 static int
10620 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10621 {
10622 #pragma unused(arg1, arg2)
10623 	uint32_t i;
10624 	int err;
10625 
10626 	i = if_sysctl_rxpoll_whiwat;
10627 
10628 	err = sysctl_handle_int(oidp, &i, 0, req);
10629 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10630 		return err;
10631 	}
10632 
10633 	if (i <= if_sysctl_rxpoll_wlowat) {
10634 		return EINVAL;
10635 	}
10636 
10637 	if_sysctl_rxpoll_whiwat = i;
10638 	return err;
10639 }
10640 
10641 static int
10642 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10643 {
10644 #pragma unused(arg1, arg2)
10645 	int i, err;
10646 
10647 	i = if_sndq_maxlen;
10648 
10649 	err = sysctl_handle_int(oidp, &i, 0, req);
10650 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10651 		return err;
10652 	}
10653 
10654 	if (i < IF_SNDQ_MINLEN) {
10655 		i = IF_SNDQ_MINLEN;
10656 	}
10657 
10658 	if_sndq_maxlen = i;
10659 	return err;
10660 }
10661 
10662 static int
10663 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10664 {
10665 #pragma unused(arg1, arg2)
10666 	int i, err;
10667 
10668 	i = if_rcvq_maxlen;
10669 
10670 	err = sysctl_handle_int(oidp, &i, 0, req);
10671 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10672 		return err;
10673 	}
10674 
10675 	if (i < IF_RCVQ_MINLEN) {
10676 		i = IF_RCVQ_MINLEN;
10677 	}
10678 
10679 	if_rcvq_maxlen = i;
10680 	return err;
10681 }
10682 
10683 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10684 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10685     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10686 {
10687 	struct kev_dl_node_presence kev;
10688 	struct sockaddr_dl *sdl;
10689 	struct sockaddr_in6 *sin6;
10690 	int ret = 0;
10691 
10692 	VERIFY(ifp);
10693 	VERIFY(sa);
10694 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10695 
10696 	bzero(&kev, sizeof(kev));
10697 	sin6 = &kev.sin6_node_address;
10698 	sdl = &kev.sdl_node_address;
10699 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10700 	kev.rssi = rssi;
10701 	kev.link_quality_metric = lqm;
10702 	kev.node_proximity_metric = npm;
10703 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10704 
10705 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10706 	if (ret == 0 || ret == EEXIST) {
10707 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10708 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10709 		if (err != 0) {
10710 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10711 			    "error %d\n", __func__, err);
10712 		}
10713 	}
10714 
10715 	if (ret == EEXIST) {
10716 		ret = 0;
10717 	}
10718 	return ret;
10719 }
10720 
10721 void
dlil_node_absent(struct ifnet * ifp,struct sockaddr * sa)10722 dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
10723 {
10724 	struct kev_dl_node_absence kev = {};
10725 	struct sockaddr_in6 *kev_sin6 = NULL;
10726 	struct sockaddr_dl *kev_sdl = NULL;
10727 	int error = 0;
10728 
10729 	VERIFY(ifp != NULL);
10730 	VERIFY(sa != NULL);
10731 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10732 
10733 	kev_sin6 = &kev.sin6_node_address;
10734 	kev_sdl = &kev.sdl_node_address;
10735 
10736 	if (sa->sa_family == AF_INET6) {
10737 		/*
10738 		 * If IPv6 address is given, get the link layer
10739 		 * address from what was cached in the neighbor cache
10740 		 */
10741 		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
10742 		bcopy(sa, kev_sin6, sa->sa_len);
10743 		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
10744 	} else {
10745 		/*
10746 		 * If passed address is AF_LINK type, derive the address
10747 		 * based on the link address.
10748 		 */
10749 		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
10750 		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
10751 	}
10752 
10753 	if (error == 0) {
10754 		kev_sdl->sdl_type = ifp->if_type;
10755 		kev_sdl->sdl_index = ifp->if_index;
10756 
10757 		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
10758 		    &kev.link_data, sizeof(kev), FALSE);
10759 	}
10760 }
10761 
10762 int
dlil_node_present_v2(struct ifnet * ifp,struct sockaddr * sa,struct sockaddr_dl * sdl,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10763 dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
10764     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10765 {
10766 	struct kev_dl_node_presence kev = {};
10767 	struct sockaddr_dl *kev_sdl = NULL;
10768 	struct sockaddr_in6 *kev_sin6 = NULL;
10769 	int ret = 0;
10770 
10771 	VERIFY(ifp != NULL);
10772 	VERIFY(sa != NULL && sdl != NULL);
10773 	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
10774 
10775 	kev_sin6 = &kev.sin6_node_address;
10776 	kev_sdl = &kev.sdl_node_address;
10777 
10778 	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
10779 	bcopy(sdl, kev_sdl, sdl->sdl_len);
10780 	kev_sdl->sdl_type = ifp->if_type;
10781 	kev_sdl->sdl_index = ifp->if_index;
10782 
10783 	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
10784 	bcopy(sa, kev_sin6, sa->sa_len);
10785 
10786 	kev.rssi = rssi;
10787 	kev.link_quality_metric = lqm;
10788 	kev.node_proximity_metric = npm;
10789 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10790 
10791 	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
10792 	if (ret == 0 || ret == EEXIST) {
10793 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10794 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10795 		if (err != 0) {
10796 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
10797 		}
10798 	}
10799 
10800 	if (ret == EEXIST) {
10801 		ret = 0;
10802 	}
10803 	return ret;
10804 }
10805 
10806 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10807 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10808     kauth_cred_t *credp)
10809 {
10810 	const u_int8_t *bytes;
10811 	size_t size;
10812 
10813 	bytes = CONST_LLADDR(sdl);
10814 	size = sdl->sdl_alen;
10815 
10816 #if CONFIG_MACF
10817 	if (dlil_lladdr_ckreq) {
10818 		switch (sdl->sdl_type) {
10819 		case IFT_ETHER:
10820 		case IFT_IEEE1394:
10821 			break;
10822 		default:
10823 			credp = NULL;
10824 			break;
10825 		}
10826 		;
10827 
10828 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10829 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10830 				[0] = 2
10831 			};
10832 
10833 			bytes = unspec;
10834 		}
10835 	}
10836 #else
10837 #pragma unused(credp)
10838 #endif
10839 
10840 	if (sizep != NULL) {
10841 		*sizep = size;
10842 	}
10843 	return bytes;
10844 }
10845 
10846 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])10847 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
10848     u_int8_t info[DLIL_MODARGLEN])
10849 {
10850 	struct kev_dl_issues kev;
10851 	struct timeval tv;
10852 
10853 	VERIFY(ifp != NULL);
10854 	VERIFY(modid != NULL);
10855 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
10856 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
10857 
10858 	bzero(&kev, sizeof(kev));
10859 
10860 	microtime(&tv);
10861 	kev.timestamp = tv.tv_sec;
10862 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
10863 	if (info != NULL) {
10864 		bcopy(info, &kev.info, DLIL_MODARGLEN);
10865 	}
10866 
10867 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
10868 	    &kev.link_data, sizeof(kev), FALSE);
10869 }
10870 
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	/*
	 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC: set or get
	 * the interface throttling ("opportunistic") level, and report
	 * the number of opportunistic TCP/UDP connections in
	 * ifr_opportunistic.ifo_inuse.  Setting requires superuser.
	 */
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* map the ioctl flag onto a throttle level */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* get: translate the throttle level back into the flag */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* EALREADY presumably means "already at that level" -- treat
	 * it as success for the caller */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10929 
10930 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)10931 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
10932 {
10933 	struct ifclassq *ifq;
10934 	int err = 0;
10935 
10936 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
10937 		return ENXIO;
10938 	}
10939 
10940 	*level = IFNET_THROTTLE_OFF;
10941 
10942 	ifq = ifp->if_snd;
10943 	IFCQ_LOCK(ifq);
10944 	/* Throttling works only for IFCQ, not ALTQ instances */
10945 	if (IFCQ_IS_ENABLED(ifq)) {
10946 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
10947 
10948 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10949 		*level = req.level;
10950 	}
10951 	IFCQ_UNLOCK(ifq);
10952 
10953 	return err;
10954 }
10955 
10956 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)10957 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
10958 {
10959 	struct ifclassq *ifq;
10960 	int err = 0;
10961 
10962 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
10963 		return ENXIO;
10964 	}
10965 
10966 	ifq = ifp->if_snd;
10967 
10968 	switch (level) {
10969 	case IFNET_THROTTLE_OFF:
10970 	case IFNET_THROTTLE_OPPORTUNISTIC:
10971 		break;
10972 	default:
10973 		return EINVAL;
10974 	}
10975 
10976 	IFCQ_LOCK(ifq);
10977 	if (IFCQ_IS_ENABLED(ifq)) {
10978 		cqrq_throttle_t req = { 1, level };
10979 
10980 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10981 	}
10982 	IFCQ_UNLOCK(ifq);
10983 
10984 	if (err == 0) {
10985 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
10986 		    level);
10987 #if NECP
10988 		necp_update_all_clients();
10989 #endif /* NECP */
10990 		if (level == IFNET_THROTTLE_OFF) {
10991 			ifnet_start(ifp);
10992 		}
10993 	}
10994 
10995 	return err;
10996 }
10997 
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	/*
	 * Handle SIOCSIFLOG / SIOCGIFLOG: set or get the per-interface
	 * logging level, flags, category and subcategory.  Setting
	 * requires PRIV_NET_INTERFACE_CONTROL.
	 */
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		/*
		 * Validate all fields before applying anything;
		 * `result' accumulates EINVAL and gates the set below.
		 */
		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* note: flags is masked in place by the condition */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		/* get: copy the current settings back to the caller */
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11045 
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	/*
	 * Apply logging parameters to the interface and, when an output
	 * control callback is registered, forward the non-DLIL portion
	 * of the request to the driver.  Callers must pass a level in
	 * [IFNET_LOG_MIN, IFNET_LOG_MAX] and at least one flag from
	 * IFNET_LOGF_MASK (VERIFYed below).
	 */
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* the DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* a DEFAULT level resets the accumulated flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11104 
11105 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11106 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11107     int32_t *category, int32_t *subcategory)
11108 {
11109 	if (level != NULL) {
11110 		*level = ifp->if_log.level;
11111 	}
11112 	if (flags != NULL) {
11113 		*flags = ifp->if_log.flags;
11114 	}
11115 	if (category != NULL) {
11116 		*category = ifp->if_log.category;
11117 	}
11118 	if (subcategory != NULL) {
11119 		*subcategory = ifp->if_log.subcategory;
11120 	}
11121 
11122 	return 0;
11123 }
11124 
11125 int
ifnet_notify_address(struct ifnet * ifp,int af)11126 ifnet_notify_address(struct ifnet *ifp, int af)
11127 {
11128 	struct ifnet_notify_address_params na;
11129 
11130 #if PF
11131 	(void) pf_ifaddr_hook(ifp);
11132 #endif /* PF */
11133 
11134 	if (ifp->if_output_ctl == NULL) {
11135 		return EOPNOTSUPP;
11136 	}
11137 
11138 	bzero(&na, sizeof(na));
11139 	na.address_family = (sa_family_t)af;
11140 
11141 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11142 	           sizeof(na), &na);
11143 }
11144 
11145 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11146 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11147 {
11148 	if (ifp == NULL || flowid == NULL) {
11149 		return EINVAL;
11150 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11151 	    !IF_FULLY_ATTACHED(ifp)) {
11152 		return ENXIO;
11153 	}
11154 
11155 	*flowid = ifp->if_flowhash;
11156 
11157 	return 0;
11158 }
11159 
11160 errno_t
ifnet_disable_output(struct ifnet * ifp)11161 ifnet_disable_output(struct ifnet *ifp)
11162 {
11163 	int err;
11164 
11165 	if (ifp == NULL) {
11166 		return EINVAL;
11167 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11168 	    !IF_FULLY_ATTACHED(ifp)) {
11169 		return ENXIO;
11170 	}
11171 
11172 	if ((err = ifnet_fc_add(ifp)) == 0) {
11173 		lck_mtx_lock_spin(&ifp->if_start_lock);
11174 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11175 		lck_mtx_unlock(&ifp->if_start_lock);
11176 	}
11177 	return err;
11178 }
11179 
11180 errno_t
ifnet_enable_output(struct ifnet * ifp)11181 ifnet_enable_output(struct ifnet *ifp)
11182 {
11183 	if (ifp == NULL) {
11184 		return EINVAL;
11185 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11186 	    !IF_FULLY_ATTACHED(ifp)) {
11187 		return ENXIO;
11188 	}
11189 
11190 	ifnet_start_common(ifp, TRUE);
11191 	return 0;
11192 }
11193 
void
ifnet_flowadv(uint32_t flowhash)
{
	/*
	 * Flow-advisory callback: re-enable output on the interface
	 * registered under `flowhash' in the flow-control tree, if it
	 * is still attached and its hash still matches.
	 */
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the io ref taken by ifnet_is_attached() above */
		ifnet_decr_iorefcnt(ifp);
	}
	/* release the entry obtained from ifnet_fc_get() */
	ifnet_fc_entry_free(ifce);
}
11217 
11218 /*
11219  * Function to compare ifnet_fc_entries in ifnet flow control tree
11220  */
11221 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11222 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11223 {
11224 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11225 }
11226 
/*
 * Register 'ifp' in the global flow-control tree, keyed by its flow
 * hash, so that a later flow advisory (ifnet_flowadv) can find it and
 * re-enable output.
 *
 * Returns 0 when the entry was added or already present for this ifp,
 * EAGAIN when a different interface already occupies the same flow hash
 * (collision; rare, and we simply decline to add a second entry).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Search key: only the flow hash field is consulted by ifce_cmp */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; zalloc with Z_WAITOK below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11270 
/*
 * Look up, and remove from the tree, the flow-control entry for the
 * given flow hash.
 *
 * Returns the entry with ownership transferred to the caller (who must
 * release it via ifnet_fc_entry_free), or NULL when no entry exists or
 * the associated interface is no longer attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Search key: only the flow hash field is consulted by ifce_cmp */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Remove now; the entry is either handed to the caller or freed */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11308 
11309 static void
ifnet_fc_entry_free(struct ifnet_fc_entry * ifce)11310 ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
11311 {
11312 	zfree(ifnet_fc_zone, ifce);
11313 }
11314 
11315 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11316 ifnet_calc_flowhash(struct ifnet *ifp)
11317 {
11318 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11319 	uint32_t flowhash = 0;
11320 
11321 	if (ifnet_flowhash_seed == 0) {
11322 		ifnet_flowhash_seed = RandomULong();
11323 	}
11324 
11325 	bzero(&fh, sizeof(fh));
11326 
11327 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11328 	fh.ifk_unit = ifp->if_unit;
11329 	fh.ifk_flags = ifp->if_flags;
11330 	fh.ifk_eflags = ifp->if_eflags;
11331 	fh.ifk_capabilities = ifp->if_capabilities;
11332 	fh.ifk_capenable = ifp->if_capenable;
11333 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11334 	fh.ifk_rand1 = RandomULong();
11335 	fh.ifk_rand2 = RandomULong();
11336 
11337 try_again:
11338 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11339 	if (flowhash == 0) {
11340 		/* try to get a non-zero flowhash */
11341 		ifnet_flowhash_seed = RandomULong();
11342 		goto try_again;
11343 	}
11344 
11345 	return flowhash;
11346 }
11347 
/*
 * Set (or, when len == 0, clear) the opaque network signature attached
 * to the interface for the given address family.
 *
 * Returns 0 on success; EINVAL for an unsupported family or a signature
 * larger than the per-family storage; ENOMEM when the per-family ifnet
 * extension (IN_IFEXTRA / IN6_IFEXTRA) has not been allocated.
 * 'flags' is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* lock dropped before leaving the switch */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* lock dropped before leaving the switch */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11409 
/*
 * Copy the interface's network signature for the given address family
 * into the caller's buffer.  On input *len is the buffer capacity; on
 * success it is updated to the actual signature length.  When 'flags'
 * is non-NULL it is set to 0 on success.
 *
 * Returns 0 on success; EINVAL for bad arguments, unsupported family,
 * or a buffer smaller than the stored signature; ENOENT when no
 * signature is set; ENOMEM when the per-family ifnet extension is
 * missing.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* reject zero-capacity or too-small buffers */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* reject zero-capacity or too-small buffers */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11470 
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the interface.
 * A slot with prefix_len == 0 is cleared.  Only the NAT64_PREFIX_LEN_*
 * lengths (32/40/48/56/64/96 bits) are accepted, and prefixes with an
 * embedded interface/link-local scope are rejected.  If at least one
 * prefix was set successfully, NECP clients are told to re-evaluate.
 *
 * Returns 0 on success, EINVAL on a bad prefix, ENOMEM when the inet6
 * ifnet extension is missing.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* notify outside the lock */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11536 
/*
 * Copy the interface's NAT64 prefix table into 'prefixes' (which must
 * be able to hold NAT64_MAX_NUM_PREFIXES entries).  A NULL 'prefixes'
 * may be passed just to probe whether any prefix is configured.
 *
 * Returns 0 on success; EINVAL for NULL ifp; ENOENT when no prefix is
 * set; ENOMEM when the inet6 ifnet extension is missing.
 */
int
ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, found_one = 0, error = 0;

	if (ifp == NULL) {
		return EINVAL;
	}

	if_inet6data_lock_shared(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	/* any non-zero prefix_len slot means a prefix is configured */
	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
			found_one = 1;
		}
	}

	if (found_one == 0) {
		error = ENOENT;
		goto out;
	}

	if (prefixes) {
		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
	}

out:
	if_inet6data_lock_done(ifp);

	return error;
}
11574 
/*
 * Debug hook on the output path: when HWCKSUM_DBG_FINALIZE_FORCED mode
 * is enabled, force software finalization of outbound checksums and
 * count what got finalized.  Packets flagged for TSO (CSUM_TSO_IPV4 /
 * CSUM_TSO_IPV6) are skipped.
 *
 * 'hoff' is the offset of the IP header within the mbuf data.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* other protocol families: nothing to finalize */
		return;
	}
}
11616 
/*
 * Debug hook on the input path for partial (16-bit 1's complement)
 * checksum offload.  Depending on hwcksum_dbg_mode it can:
 *   - force partial checksum offload in software, to simulate hardware
 *     that lacks it (HWCKSUM_DBG_PARTIAL_FORCED);
 *   - verify a driver-supplied partial checksum against a software
 *     recomputation, counting mismatches (default when CSUM_PARTIAL
 *     is set without CSUM_PSEUDO_HDR);
 *   - re-anchor the checksum at an adjusted start offset, to emulate
 *     hardware that sums from different offsets
 *     (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ).
 *
 * 'frame_header' must point within the mbuf, at or before m_data;
 * the gap between them is the link-layer header length 'hlen'.
 * Only PF_INET/PF_INET6 packets are processed.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* sanity-check the frame header pointer before deriving hlen */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11741 
11742 static int
11743 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11744 {
11745 #pragma unused(arg1, arg2)
11746 	u_int32_t i;
11747 	int err;
11748 
11749 	i = hwcksum_dbg_mode;
11750 
11751 	err = sysctl_handle_int(oidp, &i, 0, req);
11752 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11753 		return err;
11754 	}
11755 
11756 	if (hwcksum_dbg == 0) {
11757 		return ENODEV;
11758 	}
11759 
11760 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
11761 		return EINVAL;
11762 	}
11763 
11764 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
11765 
11766 	return err;
11767 }
11768 
11769 static int
11770 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
11771 {
11772 #pragma unused(arg1, arg2)
11773 	u_int32_t i;
11774 	int err;
11775 
11776 	i = hwcksum_dbg_partial_rxoff_forced;
11777 
11778 	err = sysctl_handle_int(oidp, &i, 0, req);
11779 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11780 		return err;
11781 	}
11782 
11783 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
11784 		return ENODEV;
11785 	}
11786 
11787 	hwcksum_dbg_partial_rxoff_forced = i;
11788 
11789 	return err;
11790 }
11791 
11792 static int
11793 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
11794 {
11795 #pragma unused(arg1, arg2)
11796 	u_int32_t i;
11797 	int err;
11798 
11799 	i = hwcksum_dbg_partial_rxoff_adj;
11800 
11801 	err = sysctl_handle_int(oidp, &i, 0, req);
11802 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11803 		return err;
11804 	}
11805 
11806 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
11807 		return ENODEV;
11808 	}
11809 
11810 	hwcksum_dbg_partial_rxoff_adj = i;
11811 
11812 	return err;
11813 }
11814 
11815 static int
11816 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
11817 {
11818 #pragma unused(oidp, arg1, arg2)
11819 	int err;
11820 
11821 	if (req->oldptr == USER_ADDR_NULL) {
11822 	}
11823 	if (req->newptr != USER_ADDR_NULL) {
11824 		return EPERM;
11825 	}
11826 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
11827 	    sizeof(struct chain_len_stats));
11828 
11829 	return err;
11830 }
11831 
11832 
11833 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification.  The first bytes (0x1f 0x8b 0x08) look
 * like a gzip stream header, so this appears to be a compressed blob —
 * the content itself is irrelevant; it only serves as a stable source
 * of pseudo-random byte values for the checksum self-test below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* sumr filled in at runtime? */
	uint16_t        len;    /* span length, starting at offset 0 */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11895 
/*
 * Boot-time self-test of the checksum primitives.  For every length in
 * sumtbl and every byte alignment within a uint64_t, it cross-checks
 * m_sum16() (by data-pointer alignment and by offset), b_sum16() (INET
 * only), and in_cksum_mbuf_ref() against the precomputed reference
 * sums.  Any mismatch indicates a broken checksum implementation and
 * results in an immediate panic.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* first pass per length: derive the reference sum */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
11984 #endif /* DEBUG || DEVELOPMENT */
11985 
/* Expand to a case label that returns the symbol name as a string */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel event code to its symbolic name, for logging.
 * Returns the empty string for codes not listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12022 
12023 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12024 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12025 {
12026 #pragma unused(arg1)
12027 	struct ifnet *ifp = arg0;
12028 
12029 	if (ifnet_is_attached(ifp, 1)) {
12030 		nstat_ifnet_threshold_reached(ifp->if_index);
12031 		ifnet_decr_iorefcnt(ifp);
12032 	}
12033 }
12034 
/*
 * If the interface's combined in+out byte count has advanced past
 * if_data_threshold since the last notification, schedule (or fire) the
 * data-threshold thread call so NetworkStatistics gets notified.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	/*
	 * The CAS on if_dt_bytes ensures only one thread records this
	 * crossing and schedules the call; thread_call_isactive skips
	 * scheduling when a notification is already pending.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* defer to the next periodic boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* no interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12064 
12065 #if (DEVELOPMENT || DEBUG)
12066 /*
12067  * The sysctl variable name contains the input parameters of
12068  * ifnet_get_keepalive_offload_frames()
12069  *  ifp (interface index): name[0]
12070  *  frames_array_count:    name[1]
12071  *  frame_data_offset:     name[2]
12072  * The return length gives used_frames_count
12073  */
/*
 * sysctl handler wrapping ifnet_get_keepalive_offload_frames() for
 * debug/development builds.  Input parameters arrive via the OID name
 * vector (see the comment above): name[0] = interface index, name[1] =
 * frames_array_count, name[2] = frame_data_offset.  The frames are
 * copied out one struct at a time; root only, read only.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
	goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();
	/*
	 * NOTE(review): ifp is used below after ifnet_head_done() without
	 * taking an I/O reference (ifnet_is_attached(ifp, 1)); presumably
	 * acceptable for this DEBUG/DEVELOPMENT-only sysctl — verify.
	 */

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* copy out only the frames actually filled in */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12165 #endif /* DEVELOPMENT || DEBUG */
12166 
/*
 * Record a per-flow statistics sample against the interface; currently
 * this simply forwards the sample to the TCP layer.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12173 
12174 static inline u_int32_t
_set_flags(u_int32_t * flags_p,u_int32_t set_flags)12175 _set_flags(u_int32_t *flags_p, u_int32_t set_flags)
12176 {
12177 	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
12178 }
12179 
12180 static inline void
_clear_flags(u_int32_t * flags_p,u_int32_t clear_flags)12181 _clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
12182 {
12183 	OSBitAndAtomic(~clear_flags, flags_p);
12184 }
12185 
12186 __private_extern__ u_int32_t
if_set_eflags(ifnet_t interface,u_int32_t set_flags)12187 if_set_eflags(ifnet_t interface, u_int32_t set_flags)
12188 {
12189 	return _set_flags(&interface->if_eflags, set_flags);
12190 }
12191 
/*
 * Atomically clear bits in an interface's extended flags (if_eflags).
 */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12197 
/*
 * Atomically set bits in an interface's if_xflags word; returns the
 * previous if_xflags value.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12203 
/*
 * Atomically clear bits in an interface's if_xflags word.
 */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12209 
12210 static void
log_hexdump(void * data,size_t len)12211 log_hexdump(void *data, size_t len)
12212 {
12213 	size_t i, j, k;
12214 	unsigned char *ptr = (unsigned char *)data;
12215 #define MAX_DUMP_BUF 32
12216 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12217 
12218 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12219 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12220 			unsigned char msnbl = ptr[j] >> 4;
12221 			unsigned char lsnbl = ptr[j] & 0x0f;
12222 
12223 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12224 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12225 
12226 			if ((j % 2) == 1) {
12227 				buf[k++] = ' ';
12228 			}
12229 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12230 				buf[k++] = ' ';
12231 			}
12232 		}
12233 		buf[k] = 0;
12234 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12235 	}
12236 }
12237 
12238 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
12239 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12240 net_check_compatible_if_filter(struct ifnet *ifp)
12241 {
12242 	if (ifp == NULL) {
12243 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12244 			return false;
12245 		}
12246 	} else {
12247 		if (ifp->if_flt_non_os_count > 0) {
12248 			return false;
12249 		}
12250 	}
12251 	return true;
12252 }
12253 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12254 
/*
 * Helper for dlil_dump_top_if_qlen(): after a scnprintf() has written
 * `k' bytes, consume them from the remaining length `clen' and advance
 * the output cursor `c'; jump to the function's `done' label once the
 * buffer is exhausted.  NOTE: relies on locals c/clen/k and a `done'
 * label being in scope at every expansion site.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
12261 
int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: scan the attached interfaces and write into `str' (of
 * capacity str_len) a short report naming the interface with the
 * deepest output (ifcq) queue and the one with the deepest DLIL input
 * packet queue.  Returns the number of bytes written.
 *
 * NOTE(review): the scan condition is `ifidx < if_index'; confirm
 * whether the interface at index if_index itself should also be
 * inspected (valid ifindex2ifnet slots elsewhere are often 1..if_index).
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;              /* output cursor into str */
	int k, clen = str_len;      /* k: last write size; clen: space left */
	struct ifnet *top_ifcq_ifp = NULL;  /* deepest send queue so far */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;   /* deepest input queue so far */
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		/* dlil_ifnet embeds ifnet at offset 0; cast is safe,
		 * dereferenced only after the NULL check below */
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		/* track the largest interface send (classq) queue */
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		/* track the largest DLIL input thread packet queue */
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();	/* advances c/clen; may goto done */
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	/* bytes consumed = original capacity minus space remaining */
	return str_len - clen;
}
12303