xref: /xnu-8792.61.2/bsd/net/dlil.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 1999-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include <stddef.h>
35 #include <ptrauth.h>
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62 
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69 
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
/*
 * KDebug trace codes for DLIL static probes, function entry/exit,
 * and ifnet packet-dump tracing.
 */
#define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
#define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
#define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
#define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
#define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))

#define IFNET_KTRACE_TX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x001)
#define IFNET_KTRACE_RX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x002)

#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
#define MAX_LINKADDR        4 /* LONGWORDS */


#if 1
#define DLIL_PRINTF     printf
#else
#define DLIL_PRINTF     kprintf
#endif

/* Compile-time asserts that if_data fields keep 64-bit alignment. */
#define IF_DATA_REQUIRE_ALIGNED_64(f)   \
	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))

#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170 
/*
 * Protocol KPI versions; selects which member of the if_proto
 * kpi union (v1 or v2) is active.
 */
enum {
	kProtoKPI_v1    = 1,
	kProtoKPI_v2    = 2
};
175 
176 /*
177  * List of if_proto structures in if_proto_hash[] is protected by
178  * the ifnet lock.  The rest of the fields are initialized at protocol
179  * attach time and never change, thus no lock required as long as
180  * a reference to it is valid, via if_proto_ref().
181  */
182 struct if_proto {
183 	SLIST_ENTRY(if_proto)       next_hash;
184 	u_int32_t                   refcount;
185 	u_int32_t                   detached;
186 	struct ifnet                *ifp;
187 	protocol_family_t           protocol_family;
188 	int                         proto_kpi;
189 	union {
190 		struct {
191 			proto_media_input               input;
192 			proto_media_preout              pre_output;
193 			proto_media_event               event;
194 			proto_media_ioctl               ioctl;
195 			proto_media_detached            detached;
196 			proto_media_resolve_multi       resolve_multi;
197 			proto_media_send_arp            send_arp;
198 		} v1;
199 		struct {
200 			proto_media_input_v2            input;
201 			proto_media_preout              pre_output;
202 			proto_media_event               event;
203 			proto_media_ioctl               ioctl;
204 			proto_media_detached            detached;
205 			proto_media_resolve_multi       resolve_multi;
206 			proto_media_send_arp            send_arp;
207 		} v2;
208 	} kpi;
209 };
210 
211 SLIST_HEAD(proto_hash_entry, if_proto);
212 
213 #define DLIL_SDLDATALEN \
214 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215 
216 struct dlil_ifnet {
217 	struct ifnet    dl_if;                  /* public ifnet */
218 	/*
219 	 * DLIL private fields, protected by dl_if_lock
220 	 */
221 	decl_lck_mtx_data(, dl_if_lock);
222 	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
223 	u_int32_t dl_if_flags;                  /* flags (below) */
224 	u_int32_t dl_if_refcnt;                 /* refcnt */
225 	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
226 	void    *dl_if_uniqueid;                /* unique interface id */
227 	size_t  dl_if_uniqueid_len;             /* length of the unique id */
228 	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
229 	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
230 	struct {
231 		struct ifaddr   ifa;            /* lladdr ifa */
232 		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
233 		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
234 	} dl_if_lladdr;
235 	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
236 	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
237 	u_int8_t dl_if_permanent_ether_is_set;
238 	u_int8_t dl_if_unused;
239 	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
240 	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
241 	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
242 };
243 
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG      0x4     /* has debugging info */
248 
249 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
250 
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253 
254 struct dlil_ifnet_dbg {
255 	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
256 	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
257 	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
258 	/*
259 	 * Circular lists of ifnet_{reference,release} callers.
260 	 */
261 	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
262 	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
263 };
264 
265 #define DLIL_TO_IFP(s)  (&s->dl_if)
266 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
267 
268 struct ifnet_filter {
269 	TAILQ_ENTRY(ifnet_filter)       filt_next;
270 	u_int32_t                       filt_skip;
271 	u_int32_t                       filt_flags;
272 	ifnet_t                         filt_ifp;
273 	const char                      *filt_name;
274 	void                            *filt_cookie;
275 	protocol_family_t               filt_protocol;
276 	iff_input_func                  filt_input;
277 	iff_output_func                 filt_output;
278 	iff_event_func                  filt_event;
279 	iff_ioctl_func                  filt_ioctl;
280 	iff_detached_func               filt_detached;
281 };
282 
283 struct proto_input_entry;
284 
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286 
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288 
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294 
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297     &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299     &dlil_lck_attributes);
300 
301 #if DEBUG
302 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug;        /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
308 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
310 
311 static ZONE_DEFINE(dlif_filt_zone, "ifnet_filter",
312     sizeof(struct ifnet_filter), ZC_ZFREE_CLEARMEM);
313 
314 static ZONE_DEFINE(dlif_phash_zone, "ifnet_proto_hash",
315     sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, ZC_ZFREE_CLEARMEM);
316 
317 static ZONE_DEFINE(dlif_proto_zone, "ifnet_proto",
318     sizeof(struct if_proto), ZC_ZFREE_CLEARMEM);
319 
320 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
321 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
322 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
323 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
324 
325 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
326 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
327 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
328 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
329 
330 static u_int32_t net_rtref;
331 
332 static struct dlil_main_threading_info dlil_main_input_thread_info;
333 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
334     (struct dlil_threading_info *)&dlil_main_input_thread_info;
335 
336 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
337 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
338 static void dlil_if_trace(struct dlil_ifnet *, int);
339 static void if_proto_ref(struct if_proto *);
340 static void if_proto_free(struct if_proto *);
341 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
342 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
343     u_int32_t list_count);
344 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
345 static void if_flt_monitor_busy(struct ifnet *);
346 static void if_flt_monitor_unbusy(struct ifnet *);
347 static void if_flt_monitor_enter(struct ifnet *);
348 static void if_flt_monitor_leave(struct ifnet *);
349 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
350     char **, protocol_family_t);
351 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
352     protocol_family_t);
353 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
354     const struct sockaddr_dl *);
355 static int ifnet_lookup(struct ifnet *);
356 static void if_purgeaddrs(struct ifnet *);
357 
358 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
359     struct mbuf *, char *);
360 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
361     struct mbuf *);
362 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
363     mbuf_t *, const struct sockaddr *, void *, char *, char *);
364 static void ifproto_media_event(struct ifnet *, protocol_family_t,
365     const struct kev_msg *);
366 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
367     unsigned long, void *);
368 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
369     struct sockaddr_dl *, size_t);
370 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
371     const struct sockaddr_dl *, const struct sockaddr *,
372     const struct sockaddr_dl *, const struct sockaddr *);
373 
374 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
375     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
376     boolean_t poll, struct thread *tp);
377 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
378     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
379 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
380 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
381     protocol_family_t *);
382 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
383     const struct ifnet_demux_desc *, u_int32_t);
384 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
385 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
386 #if !XNU_TARGET_OS_OSX
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388     const struct sockaddr *, const char *, const char *,
389     u_int32_t *, u_int32_t *);
390 #else /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
392     const struct sockaddr *, const char *, const char *);
393 #endif /* XNU_TARGET_OS_OSX */
394 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
395     const struct sockaddr *, const char *, const char *,
396     u_int32_t *, u_int32_t *);
397 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
398 static void ifp_if_free(struct ifnet *);
399 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
400 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
401 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
402 
403 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
404     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405     boolean_t, struct thread *);
406 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
407     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
408     boolean_t, struct thread *);
409 
410 static void dlil_main_input_thread_func(void *, wait_result_t);
411 static void dlil_main_input_thread_cont(void *, wait_result_t);
412 
413 static void dlil_input_thread_func(void *, wait_result_t);
414 static void dlil_input_thread_cont(void *, wait_result_t);
415 
416 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
417 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
418 
419 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
420     thread_continue_t *);
421 static void dlil_terminate_input_thread(struct dlil_threading_info *);
422 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
423     struct dlil_threading_info *, struct ifnet *, boolean_t);
424 static boolean_t dlil_input_stats_sync(struct ifnet *,
425     struct dlil_threading_info *);
426 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
427     u_int32_t, ifnet_model_t, boolean_t);
428 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
429     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
430 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
431 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
432 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
433 #if DEBUG || DEVELOPMENT
434 static void dlil_verify_sum16(void);
435 #endif /* DEBUG || DEVELOPMENT */
436 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
437     protocol_family_t);
438 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
439     protocol_family_t);
440 
441 static void dlil_incr_pending_thread_count(void);
442 static void dlil_decr_pending_thread_count(void);
443 
444 static void ifnet_detacher_thread_func(void *, wait_result_t);
445 static void ifnet_detacher_thread_cont(void *, wait_result_t);
446 static void ifnet_detach_final(struct ifnet *);
447 static void ifnet_detaching_enqueue(struct ifnet *);
448 static struct ifnet *ifnet_detaching_dequeue(void);
449 
450 static void ifnet_start_thread_func(void *, wait_result_t);
451 static void ifnet_start_thread_cont(void *, wait_result_t);
452 
453 static void ifnet_poll_thread_func(void *, wait_result_t);
454 static void ifnet_poll_thread_cont(void *, wait_result_t);
455 
456 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
457     classq_pkt_t *, boolean_t, boolean_t *);
458 
459 static void ifp_src_route_copyout(struct ifnet *, struct route *);
460 static void ifp_src_route_copyin(struct ifnet *, struct route *);
461 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
462 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
463 
464 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
465 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
470 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
471 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
473 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
474 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
475 
476 struct chain_len_stats tx_chain_len_stats;
477 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
478 
479 #if TEST_INPUT_THREAD_TERMINATION
480 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
481 #endif /* TEST_INPUT_THREAD_TERMINATION */
482 
483 
484 /* The following are protected by dlil_ifnet_lock */
485 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
486 static u_int32_t ifnet_detaching_cnt;
487 static boolean_t ifnet_detaching_embryonic;
488 static void *ifnet_delayed_run; /* wait channel for detaching thread */
489 
490 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
491     &dlil_lck_attributes);
492 
493 static uint32_t ifnet_flowhash_seed;
494 
495 struct ifnet_flowhash_key {
496 	char            ifk_name[IFNAMSIZ];
497 	uint32_t        ifk_unit;
498 	uint32_t        ifk_flags;
499 	uint32_t        ifk_eflags;
500 	uint32_t        ifk_capabilities;
501 	uint32_t        ifk_capenable;
502 	uint32_t        ifk_output_sched_model;
503 	uint32_t        ifk_rand1;
504 	uint32_t        ifk_rand2;
505 };
506 
507 /* Flow control entry per interface */
508 struct ifnet_fc_entry {
509 	RB_ENTRY(ifnet_fc_entry) ifce_entry;
510 	u_int32_t       ifce_flowhash;
511 	struct ifnet    *ifce_ifp;
512 };
513 
514 static uint32_t ifnet_calc_flowhash(struct ifnet *);
515 static int ifce_cmp(const struct ifnet_fc_entry *,
516     const struct ifnet_fc_entry *);
517 static int ifnet_fc_add(struct ifnet *);
518 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
519 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
520 
521 /* protected by ifnet_fc_lock */
522 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
523 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
524 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
525 
526 static ZONE_DEFINE(ifnet_fc_zone, "ifnet_fc_zone",
527     sizeof(struct ifnet_fc_entry), ZC_ZFREE_CLEARMEM);
528 
529 extern void bpfdetach(struct ifnet *);
530 extern void proto_input_run(void);
531 
532 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
533     u_int32_t flags);
534 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
535     u_int32_t flags);
536 
537 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
538 
539 #if CONFIG_MACF
540 #if !XNU_TARGET_OS_OSX
541 int dlil_lladdr_ckreq = 1;
542 #else /* XNU_TARGET_OS_OSX */
543 int dlil_lladdr_ckreq = 0;
544 #endif /* XNU_TARGET_OS_OSX */
545 #endif /* CONFIG_MACF */
546 
547 #if DEBUG
548 int dlil_verbose = 1;
549 #else
550 int dlil_verbose = 0;
551 #endif /* DEBUG */
552 #if IFNET_INPUT_SANITY_CHK
553 /* sanity checking of input packet lists received */
554 static u_int32_t dlil_input_sanity_check = 0;
555 #endif /* IFNET_INPUT_SANITY_CHK */
556 /* rate limit debug messages */
557 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
558 
559 SYSCTL_DECL(_net_link_generic_system);
560 
561 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
562     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
563 
564 #define IF_SNDQ_MINLEN  32
565 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
566 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
567     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
568     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
569 
570 #define IF_RCVQ_MINLEN  32
571 #define IF_RCVQ_MAXLEN  256
572 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
573 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
574     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
575     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
576 
577 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
578 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
579 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
580     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
581     "ilog2 of EWMA decay rate of avg inbound packets");
582 
583 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
584 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
585 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
586 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
587     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
588     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
589     "Q", "input poll mode freeze time");
590 
591 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
592 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
593 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
595     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
596     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
597     "Q", "input poll sampling time");
598 
599 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
600 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
601     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
602     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
603     "Q", "input poll interval (time)");
604 
605 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
606 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
607 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
608     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
609     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
610 
611 #define IF_RXPOLL_WLOWAT        10
612 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
613 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
614     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
615     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
616     "I", "input poll wakeup low watermark");
617 
618 #define IF_RXPOLL_WHIWAT        100
619 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
620 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
621     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
622     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
623     "I", "input poll wakeup high watermark");
624 
625 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
626 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
627     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
628     "max packets per poll call");
629 
630 u_int32_t if_rxpoll = 1;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
632     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
633     sysctl_rxpoll, "I", "enable opportunistic input polling");
634 
635 #if TEST_INPUT_THREAD_TERMINATION
636 static u_int32_t if_input_thread_termination_spin = 0;
637 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
638     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
639     &if_input_thread_termination_spin, 0,
640     sysctl_input_thread_termination_spin,
641     "I", "input thread termination spin limit");
642 #endif /* TEST_INPUT_THREAD_TERMINATION */
643 
644 static u_int32_t cur_dlil_input_threads = 0;
645 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
646     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
647     "Current number of DLIL input threads");
648 
649 #if IFNET_INPUT_SANITY_CHK
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
651     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
652     "Turn on sanity checking in DLIL input");
653 #endif /* IFNET_INPUT_SANITY_CHK */
654 
655 static u_int32_t if_flowadv = 1;
656 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
657     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
658     "enable flow-advisory mechanism");
659 
660 static u_int32_t if_delaybased_queue = 1;
661 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
662     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
663     "enable delay based dynamic queue sizing");
664 
665 static uint64_t hwcksum_in_invalidated = 0;
666 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
667     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
668     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
669 
670 uint32_t hwcksum_dbg = 0;
671 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
672     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
673     "enable hardware cksum debugging");
674 
675 u_int32_t ifnet_start_delayed = 0;
676 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
677     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
678     "number of times start was delayed");
679 
680 u_int32_t ifnet_delay_start_disabled = 0;
681 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
682     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
683     "number of times start was delayed");
684 
685 #if DEVELOPMENT || DEBUG
686 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
687 
688 struct flow_key flow_key_trace;
689 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
690     CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
691 #endif /* DEVELOPMENT || DEBUG */
692 
693 static inline void
ifnet_delay_start_disabled_increment(void)694 ifnet_delay_start_disabled_increment(void)
695 {
696 	OSIncrementAtomic(&ifnet_delay_start_disabled);
697 }
698 
699 #define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
700 #define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
701 #define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
702 #define HWCKSUM_DBG_MASK \
703 	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
704 	HWCKSUM_DBG_FINALIZE_FORCED)
705 
/*
 * Hardware-checksum debugging state: a writable mode selector plus
 * read-only counters, all exported under net.link.generic.system.
 */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");
763 
/* Global enable/disable knobs for TX/RX hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* Read-only export of TX chain-length statistics */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* Data-threshold notification knobs */
static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");
789 
#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* Networking API usage counters, exported read-only as net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

/* Global DLIL behavior knobs (not exported via sysctl here) */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
816 
817 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)818 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
819 {
820 	/*
821 	 * update filter count and route_generation ID to let TCP
822 	 * know it should reevalute doing TSO or not
823 	 */
824 	if (filter_enable) {
825 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
826 	} else {
827 		VERIFY(ifp->if_flt_no_tso_count != 0);
828 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
829 	}
830 	routegenid_update();
831 }
832 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* 1 iff the default policy enables the flowswitch IP netagent */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
/* 1 iff the default policy enables the flowswitch transport netagent */
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;

#if (DEVELOPMENT || DEBUG)
853 static int
854 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
855 {
856 #pragma unused(oidp, arg1, arg2)
857 	unsigned int new_value;
858 	int changed;
859 	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
860 	    &new_value, &changed);
861 	if (error) {
862 		return error;
863 	}
864 	if (changed) {
865 		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
866 		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
867 			return ENOTSUP;
868 		}
869 		if_attach_nx = new_value;
870 	}
871 	return 0;
872 }
873 
/* Debug-only knob exposing the nexus auto-attach flags */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
879 
880 static int
881 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
882 {
883 #pragma unused(oidp, arg1, arg2)
884 	unsigned int new_value;
885 	int changed;
886 	int error;
887 
888 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
889 	    sizeof(if_enable_fsw_transport_netagent),
890 	    &new_value, &changed);
891 	if (error == 0 && changed != 0) {
892 		if (new_value != 0 && new_value != 1) {
893 			/* only allow 0 or 1 */
894 			error = EINVAL;
895 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
896 			/* netagent can be enabled/disabled */
897 			if_enable_fsw_transport_netagent = new_value;
898 			if (new_value == 0) {
899 				kern_nexus_deregister_netagents();
900 			} else {
901 				kern_nexus_register_netagents();
902 			}
903 		} else {
904 			/* netagent can't be enabled */
905 			error = ENOTSUP;
906 		}
907 	}
908 	return error;
909 }
910 
/* Runtime toggle for the flowswitch transport netagent */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
919 
920 boolean_t
ifnet_nx_noauto(ifnet_t ifp)921 ifnet_nx_noauto(ifnet_t ifp)
922 {
923 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
924 }
925 
/*
 * TRUE when a flowswitch must not be auto-attached to this interface;
 * currently that is the case only for low-latency interfaces.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
931 
932 boolean_t
ifnet_is_low_latency(ifnet_t ifp)933 ifnet_is_low_latency(ifnet_t ifp)
934 {
935 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
936 }
937 
/*
 * Decide whether the netif compat layer should be plumbed for this
 * interface.  Returns FALSE when compat auto-attach is globally
 * disabled; on non-macOS targets, Wi-Fi Access Point interfaces
 * (named "ap") are plumbed only when if_netif_all is set.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
962 
963 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)964 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
965 {
966 	if (if_is_fsw_transport_netagent_enabled()) {
967 		/* check if netagent has been manually enabled for ipsec/utun */
968 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
969 			return ipsec_interface_needs_netagent(ifp);
970 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
971 			return utun_interface_needs_netagent(ifp);
972 		}
973 
974 		/* check ifnet no auto nexus override */
975 		if (ifnet_nx_noauto(ifp)) {
976 			return FALSE;
977 		}
978 
979 		/* check global if_attach_nx configuration */
980 		switch (ifp->if_family) {
981 		case IFNET_FAMILY_CELLULAR:
982 		case IFNET_FAMILY_ETHERNET:
983 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
984 				return TRUE;
985 			}
986 			break;
987 		default:
988 			break;
989 		}
990 	}
991 	return FALSE;
992 }
993 
994 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)995 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
996 {
997 #pragma unused(ifp)
998 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
999 		return TRUE;
1000 	}
1001 	return FALSE;
1002 }
1003 
1004 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1005 ifnet_needs_netif_netagent(ifnet_t ifp)
1006 {
1007 #pragma unused(ifp)
1008 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1009 }
1010 
/*
 * Tear down a nexus provider instance: detach the device port first
 * (when one exists), then free the instance.  Errors from either step
 * are logged but not propagated.  Returns TRUE if an instance was
 * present (i.e. teardown was attempted), FALSE otherwise.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t         err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1037 
1038 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1039 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1040     uuid_t device)
1041 {
1042 	boolean_t               detached = FALSE;
1043 	nexus_controller_t      controller = kern_nexus_shared_controller();
1044 	int                     err;
1045 
1046 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1047 	    device)) {
1048 		detached = TRUE;
1049 	}
1050 	if (provider != NULL && !uuid_is_null(provider)) {
1051 		detached = TRUE;
1052 		err = kern_nexus_controller_deregister_provider(controller,
1053 		    provider);
1054 		if (err != 0) {
1055 			DLIL_PRINTF("%s deregister_provider %d\n",
1056 			    func_str, err);
1057 		}
1058 	}
1059 	return detached;
1060 }
1061 
/*
 * Register a nexus provider (netif or flowswitch, per 'type') named
 * "com.apple.<type>.<ifname>" under the default domain provider, and
 * allocate one instance of it.  On success *provider and *instance are
 * filled in and 0 is returned; on failure the provider is deregistered
 * (if it was registered) and an errno is returned.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration before bailing */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* success falls through; err is 0 here */
failed:
	return err;
}
1111 
/*
 * Create a netif nexus provider/instance for the interface and attach
 * the interface to it, filling in 'netif_nx' with the resulting UUIDs.
 * Returns TRUE on success.  On any failure (including the interface
 * already having a nexus attached, per IFCAP_SKYWALK) any partially
 * created state is torn down and FALSE is returned.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus attributes to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1165 
1166 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1167 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1168 {
1169 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1170 	    IFNET_IS_VMNET(ifp)) {
1171 		goto failed;
1172 	}
1173 	switch (ifp->if_type) {
1174 	case IFT_CELLULAR:
1175 	case IFT_ETHER:
1176 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1177 			/* don't auto-attach */
1178 			goto failed;
1179 		}
1180 		break;
1181 	default:
1182 		/* don't auto-attach */
1183 		goto failed;
1184 	}
1185 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1186 
1187 failed:
1188 	return FALSE;
1189 }
1190 
1191 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1192 dlil_is_native_netif_nexus(ifnet_t ifp)
1193 {
1194 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1195 }
1196 
/*
 * Detach and free the netif nexus previously attached to an interface;
 * thin wrapper passing the netif UUID triplet to dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1204 
1205 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1206 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1207 {
1208 	struct ifreq        ifr;
1209 	int                 error;
1210 
1211 	bzero(&ifr, sizeof(ifr));
1212 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1213 	if (error == 0) {
1214 		*ifdm_p = ifr.ifr_devmtu;
1215 	}
1216 	return error;
1217 }
1218 
1219 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1220 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1221     bool *use_multi_buflet, uint32_t *large_buf_size)
1222 {
1223 	struct kern_pbufpool_memory_info rx_pp_info;
1224 	struct kern_pbufpool_memory_info tx_pp_info;
1225 	uint32_t if_max_mtu = 0;
1226 	uint32_t drv_buf_size;
1227 	struct ifdevmtu ifdm;
1228 	int err;
1229 
1230 	/*
1231 	 * To perform intra-stack RX aggregation flowswitch needs to use
1232 	 * multi-buflet packet.
1233 	 */
1234 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1235 
1236 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1237 	/*
1238 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1239 	 * but the driver advertises the MAX MTU as only 9K.
1240 	 */
1241 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1242 		if_max_mtu = IP_MAXPACKET;
1243 		goto skip_mtu_ioctl;
1244 	}
1245 
1246 	/* determine max mtu */
1247 	bzero(&ifdm, sizeof(ifdm));
1248 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1249 	if (__improbable(err != 0)) {
1250 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1251 		    __func__, if_name(ifp));
1252 		/* use default flowswitch buffer size */
1253 		if_max_mtu = NX_FSW_BUFSIZE;
1254 	} else {
1255 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1256 		    ifdm.ifdm_max, ifdm.ifdm_current);
1257 		/* rdar://problem/44589731 */
1258 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1259 	}
1260 
1261 skip_mtu_ioctl:
1262 	if (if_max_mtu == 0) {
1263 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1264 		    __func__, if_name(ifp));
1265 		return EINVAL;
1266 	}
1267 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1268 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1269 		    "max bufsize(%d)\n", __func__,
1270 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1271 		return EINVAL;
1272 	}
1273 
1274 	/*
1275 	 * for skywalk native driver, consult the driver packet pool also.
1276 	 */
1277 	if (dlil_is_native_netif_nexus(ifp)) {
1278 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1279 		    &tx_pp_info);
1280 		if (err != 0) {
1281 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1282 			    __func__, if_name(ifp));
1283 			return ENXIO;
1284 		}
1285 		drv_buf_size = tx_pp_info.kpm_bufsize *
1286 		    tx_pp_info.kpm_max_frags;
1287 		if (if_max_mtu > drv_buf_size) {
1288 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1289 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1290 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1291 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1292 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1293 			return EINVAL;
1294 		}
1295 	} else {
1296 		drv_buf_size = if_max_mtu;
1297 	}
1298 
1299 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1300 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1301 		*use_multi_buflet = true;
1302 		/* default flowswitch buffer size */
1303 		*buf_size = NX_FSW_BUFSIZE;
1304 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1305 	} else {
1306 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1307 	}
1308 
1309 	/*
1310 	 * if HW TSO is enabled on a Skywalk native interface then make
1311 	 * the flowswitch default buffer be able to handle max TSO segment.
1312 	 */
1313 	uint32_t tso_v4_mtu = 0;
1314 	uint32_t tso_v6_mtu = 0;
1315 #ifdef XNU_TARGET_OS_OSX
1316 	if (dlil_is_native_netif_nexus(ifp)) {
1317 		if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
1318 			tso_v4_mtu = ifp->if_tso_v4_mtu;
1319 		}
1320 		if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
1321 			tso_v6_mtu = ifp->if_tso_v6_mtu;
1322 		}
1323 	}
1324 #endif /* XNU_TARGET_OS_OSX */
1325 	if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
1326 		*buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
1327 		ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1328 	}
1329 	if (*buf_size >= *large_buf_size) {
1330 		*large_buf_size = 0;
1331 	}
1332 	return 0;
1333 }
1334 
/*
 * Create and attach a flowswitch nexus for the interface, filling
 * 'nexus_fsw' with the provider/instance/device UUIDs.  Policy checks
 * (no-auto, low-latency, vmnet, IFCAP_SKYWALK, if_attach_nx) gate the
 * attach; buffer sizing comes from _dlil_get_flowswitch_buffer_size().
 * Returns TRUE on success; FALSE (with a log message) otherwise.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means a policy check (not an API call) bailed out */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1433 
/*
 * Attach a flowswitch nexus to the interface and publish it in
 * ifp->if_nx_flowswitch under the ifnet lock.  The attach is built in
 * a local copy first; if the interface detached in the meantime the
 * freshly created nexus is torn down again.  Returns TRUE on success.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1479 
/*
 * Detach and free the flowswitch nexus previously attached to an
 * interface; thin wrapper passing the flowswitch UUID triplet to
 * dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1487 
1488 __attribute__((noinline))
1489 static void
dlil_netif_detach_notify(ifnet_t ifp)1490 dlil_netif_detach_notify(ifnet_t ifp)
1491 {
1492 	void (*detach_notify)(struct nexus_netif_adapter *);
1493 
1494 	/*
1495 	 * This is only needed for low latency interfaces for now.
1496 	 */
1497 	if (!ifnet_is_low_latency(ifp)) {
1498 		return;
1499 	}
1500 	detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1501 	if (detach_notify != NULL) {
1502 		(*detach_notify)(ifp->if_na);
1503 	} else {
1504 		DLIL_PRINTF("%s: %s has no detach notify calback\n",
1505 		    __func__, if_name(ifp));
1506 	}
1507 }
1508 
/*
 * Quiesce data movement on the interface and tear down its attached
 * nexuses: the flowswitch first, then the netif.  State that is absent
 * is asserted to be fully absent (no half-attached UUID triplets).
 * Data movement is resumed before returning.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain all data movement before detaching */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1540 
1541 boolean_t
ifnet_add_netagent(ifnet_t ifp)1542 ifnet_add_netagent(ifnet_t ifp)
1543 {
1544 	int     error;
1545 
1546 	error = kern_nexus_interface_add_netagent(ifp);
1547 	os_log(OS_LOG_DEFAULT,
1548 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1549 	    ifp->if_xname, error);
1550 	return error == 0;
1551 }
1552 
1553 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1554 ifnet_remove_netagent(ifnet_t ifp)
1555 {
1556 	int     error;
1557 
1558 	error = kern_nexus_interface_remove_netagent(ifp);
1559 	os_log(OS_LOG_DEFAULT,
1560 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1561 	    ifp->if_xname, error);
1562 	return error == 0;
1563 }
1564 
1565 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1566 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1567 {
1568 	if (!IF_FULLY_ATTACHED(ifp)) {
1569 		return FALSE;
1570 	}
1571 	return dlil_attach_flowswitch_nexus(ifp);
1572 }
1573 
1574 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1575 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1576 {
1577 	if_nexus_flowswitch     nexus_fsw;
1578 
1579 	ifnet_lock_exclusive(ifp);
1580 	nexus_fsw = ifp->if_nx_flowswitch;
1581 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1582 	ifnet_lock_done(ifp);
1583 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1584 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1585 }
1586 
1587 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1588 ifnet_attach_netif_nexus(ifnet_t ifp)
1589 {
1590 	boolean_t       nexus_attached;
1591 	if_nexus_netif  nexus_netif;
1592 
1593 	if (!IF_FULLY_ATTACHED(ifp)) {
1594 		return FALSE;
1595 	}
1596 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1597 	if (nexus_attached) {
1598 		ifnet_lock_exclusive(ifp);
1599 		ifp->if_nx_netif = nexus_netif;
1600 		ifnet_lock_done(ifp);
1601 	}
1602 	return nexus_attached;
1603 }
1604 
1605 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1606 ifnet_detach_netif_nexus(ifnet_t ifp)
1607 {
1608 	if_nexus_netif  nexus_netif;
1609 
1610 	ifnet_lock_exclusive(ifp);
1611 	nexus_netif = ifp->if_nx_netif;
1612 	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1613 	ifnet_lock_done(ifp);
1614 
1615 	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1616 	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1617 }
1618 
1619 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1620 ifnet_attach_native_flowswitch(ifnet_t ifp)
1621 {
1622 	if (!dlil_is_native_netif_nexus(ifp)) {
1623 		/* not a native netif */
1624 		return;
1625 	}
1626 	ifnet_attach_flowswitch_nexus(ifp);
1627 }
1628 
#endif /* SKYWALK */

/*
 * Sanity-check an inbound mbuf: must carry a pkthdr whose rcvif matches
 * the expected interface (loopback excepted); panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}

/*
 * Exponentially-weighted moving average: fold 'new' into 'old' with a
 * weight of 1/2^decay (seeded directly from 'new' when 'old' is 0).
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* Link-speed units in bits per second */
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)
1651 
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/*
 * RX-poll watermarks indexed by downlink speed; the zero-speed entry
 * terminates the table.
 */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* Protects dlil_pending_thread_cnt and its wakeup channel */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1672 
/*
 * Account for a DLIL support thread that has been created but has not
 * yet started running; the counter is protected by dlil_thread_sync_lock.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1681 
/*
 * Drop the pending-thread count taken by dlil_incr_pending_thread_count();
 * when it reaches zero, wake any waiter sleeping on the counter.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1694 
1695 int
proto_hash_value(u_int32_t protocol_family)1696 proto_hash_value(u_int32_t protocol_family)
1697 {
1698 	/*
1699 	 * dlil_proto_unplumb_all() depends on the mapping between
1700 	 * the hash bucket index and the protocol family defined
1701 	 * here; future changes must be applied there as well.
1702 	 */
1703 	switch (protocol_family) {
1704 	case PF_INET:
1705 		return 0;
1706 	case PF_INET6:
1707 		return 1;
1708 	case PF_VLAN:
1709 		return 2;
1710 	case PF_UNSPEC:
1711 	default:
1712 		return 3;
1713 	}
1714 }
1715 
1716 /*
1717  * Caller must already be holding ifnet lock.
1718  */
1719 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1720 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1721 {
1722 	struct if_proto *proto = NULL;
1723 	u_int32_t i = proto_hash_value(protocol_family);
1724 
1725 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1726 
1727 	if (ifp->if_proto_hash != NULL) {
1728 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1729 	}
1730 
1731 	while (proto != NULL && proto->protocol_family != protocol_family) {
1732 		proto = SLIST_NEXT(proto, next_hash);
1733 	}
1734 
1735 	if (proto != NULL) {
1736 		if_proto_ref(proto);
1737 	}
1738 
1739 	return proto;
1740 }
1741 
/* Take a reference on an attached-protocol entry. */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1747 
1748 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1749 
/*
 * Release a reference on an attached protocol.  On the last release:
 * invoke the protocol's detached callback (v1 or v2 KPI), purge its
 * routes, post KEV_DL_PROTO_DETACHED, and free the entry.  If this
 * was the last protocol on the interface, also mark the interface
 * down.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	/* atomic_add_32_ov returns the pre-decrement value */
	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1811 
/*
 * Assert the state of an interface's RW lock, translating the DLIL
 * assertion type into the corresponding LCK_RW assertion.  NOTOWNED
 * is a no-op for RW locks and is deliberately bypassed.
 */
__private_extern__ void
ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
{
#if !MACH_ASSERT
#pragma unused(ifp)
#endif
	unsigned int type = 0;
	int ass = 1;

	switch (what) {
	case IFNET_LCK_ASSERT_EXCLUSIVE:
		type = LCK_RW_ASSERT_EXCLUSIVE;
		break;

	case IFNET_LCK_ASSERT_SHARED:
		type = LCK_RW_ASSERT_SHARED;
		break;

	case IFNET_LCK_ASSERT_OWNED:
		type = LCK_RW_ASSERT_HELD;
		break;

	case IFNET_LCK_ASSERT_NOTOWNED:
		/* nothing to do here for RW lock; bypass assert */
		ass = 0;
		break;

	default:
		panic("bad ifnet assert type: %d", what);
		/* NOTREACHED */
	}
	if (ass) {
		LCK_RW_ASSERT(&ifp->if_lock, type);
	}
}
1847 
/* Acquire the per-interface RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
1853 
/* Acquire the per-interface RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1859 
/* Release the per-interface RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1865 
1866 #if INET
/* Acquire the per-interface IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
1872 
/* Acquire the per-interface IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
1878 
/* Release the per-interface IPv4 data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
1884 #endif
1885 
/* Acquire the per-interface IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
1891 
/* Acquire the per-interface IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
1897 
/* Release the per-interface IPv6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1903 
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}
1909 
/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
1915 
/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}
1921 
/* Assert that the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1927 
1928 /*
1929  * dlil_ifp_protolist
1930  * - get the list of protocols attached to the interface, or just the number
1931  *   of attached protocols
1932  * - if the number returned is greater than 'list_count', truncation occurred
1933  *
1934  * Note:
1935  * - caller must already be holding ifnet lock.
1936  */
1937 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1938 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1939     u_int32_t list_count)
1940 {
1941 	u_int32_t       count = 0;
1942 	int             i;
1943 
1944 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1945 
1946 	if (ifp->if_proto_hash == NULL) {
1947 		goto done;
1948 	}
1949 
1950 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1951 		struct if_proto *proto;
1952 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1953 			if (list != NULL && count < list_count) {
1954 				list[count] = proto->protocol_family;
1955 			}
1956 			count++;
1957 		}
1958 	}
1959 done:
1960 	return count;
1961 }
1962 
/*
 * Copy up to "count" attached protocol families into "protolist"
 * under the shared ifnet lock; returns the total number attached,
 * which may exceed "count" if truncation occurred.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1971 
/* Free a protocol list obtained by an if_get_protolist() caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1977 
1978 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len,boolean_t suppress_generation)1979 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1980     u_int32_t event_code, struct net_event_data *event_data,
1981     u_int32_t event_data_len, boolean_t suppress_generation)
1982 {
1983 	struct net_event_data ev_data;
1984 	struct kev_msg ev_msg;
1985 
1986 	bzero(&ev_msg, sizeof(ev_msg));
1987 	bzero(&ev_data, sizeof(ev_data));
1988 	/*
1989 	 * a net event always starts with a net_event_data structure
1990 	 * but the caller can generate a simple net event or
1991 	 * provide a longer event structure to post
1992 	 */
1993 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1994 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1995 	ev_msg.kev_subclass     = event_subclass;
1996 	ev_msg.event_code       = event_code;
1997 
1998 	if (event_data == NULL) {
1999 		event_data = &ev_data;
2000 		event_data_len = sizeof(struct net_event_data);
2001 	}
2002 
2003 	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
2004 	event_data->if_family = ifp->if_family;
2005 	event_data->if_unit   = (u_int32_t)ifp->if_unit;
2006 
2007 	ev_msg.dv[0].data_length = event_data_len;
2008 	ev_msg.dv[0].data_ptr    = event_data;
2009 	ev_msg.dv[1].data_length = 0;
2010 
2011 	bool update_generation = true;
2012 	if (event_subclass == KEV_DL_SUBCLASS) {
2013 		/* Don't update interface generation for frequent link quality and state changes  */
2014 		switch (event_code) {
2015 		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
2016 		case KEV_DL_RRC_STATE_CHANGED:
2017 		case KEV_DL_PRIMARY_ELECTED:
2018 			update_generation = false;
2019 			break;
2020 		default:
2021 			break;
2022 		}
2023 	}
2024 
2025 	/*
2026 	 * Some events that update generation counts might
2027 	 * want to suppress generation count.
2028 	 * One example is node presence/absence where we still
2029 	 * issue kernel event for the invocation but want to avoid
2030 	 * expensive operation of updating generation which triggers
2031 	 * NECP client updates.
2032 	 */
2033 	if (suppress_generation) {
2034 		update_generation = false;
2035 	}
2036 
2037 	return dlil_event_internal(ifp, &ev_msg, update_generation);
2038 }
2039 
/*
 * Allocate the per-interface protocol statistics blocks (TCP, UDP,
 * and IPv4/IPv6 ECN stats).  The TCP/UDP blocks are carved out of
 * oversized zone objects so that the stats themselves are 64-bit
 * aligned; the original zone pointer is stashed one pointer-size
 * before the aligned base so it can be recovered at free time.
 *
 * Returns 0 on success, EINVAL otherwise.
 *
 * NOTE(review): if the TCP/UDP blocks are ALREADY allocated on entry,
 * ret stays EINVAL and the cleanup below frees them -- presumably this
 * is only ever called once per ifnet; confirm against callers before
 * relying on re-entry.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		/* failure: undo any allocations recorded on the ifnet */
		if (ifp->if_tcp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2125 
/*
 * Reset all opportunistic-polling state on an interface: clear the
 * poll cycle, flags, request counter and mode, zero the statistics,
 * and clear every polling timer.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2144 
/*
 * Set up the input path for an interface (or, with ifp == NULL, the
 * main DLIL input thread at dlil_init time).  Selects one of four
 * strategies -- main thread, legacy rxpoll hybrid, asynchronous, or
 * synchronous (no thread) -- initializes the threading info's lock
 * and packet queue, and starts the kernel thread when one is needed.
 *
 * Returns 0 on success; ENODEV when the synchronous strategy is
 * chosen (no thread started).  On success with a thread, *thfunc (if
 * non-NULL) receives the thread's continuation function.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy hybrid polling requires rxpoll + IFEF_RXPOLL + IFXF_LEGACY */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* (u_int32_t)-1: effectively no queue limit */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no dedicated thread to start */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2279 
2280 #if TEST_INPUT_THREAD_TERMINATION
/*
 * Sysctl handler for the input-thread termination spin count
 * (TEST_INPUT_THREAD_TERMINATION builds only); the new value is
 * rejected with ENXIO unless rxpoll is enabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only request: nothing more to do */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2302 #endif /* TEST_INPUT_THREAD_TERMINATION */
2303 
/*
 * Tear down and reset a threading info structure after its input
 * thread has terminated: destroy the lock and its group, clear all
 * state, and verify nothing (packets, helper threads, affinity) is
 * left behind.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2329 
/*
 * Final act of an interface input thread, executed on the thread
 * itself: drain the packet queue, acknowledge the TERMINATE request
 * by setting TERMINATE_COMPLETE and waking the waiter, free the
 * drained packets, drop the kernel_thread_start() reference, and
 * terminate.  Never returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach any queued packets so they can be freed off-lock below */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2377 
2378 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2379 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2380 {
2381 	thread_affinity_policy_data_t policy;
2382 
2383 	bzero(&policy, sizeof(policy));
2384 	policy.affinity_tag = tag;
2385 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2386 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2387 }
2388 
2389 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2390 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2391 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2392     enum net_filter_event_subsystems state)
2393 {
2394 	if (state == 0) {
2395 		if_enable_fsw_transport_netagent = 1;
2396 	} else {
2397 		if_enable_fsw_transport_netagent = 0;
2398 	}
2399 	kern_nexus_update_netagents();
2400 }
2401 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2402 
2403 void
dlil_init(void)2404 dlil_init(void)
2405 {
2406 	thread_t thread = THREAD_NULL;
2407 
2408 	/*
2409 	 * The following fields must be 64-bit aligned for atomic operations.
2410 	 */
2411 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2412 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2413 	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2414 	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2415 	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2416 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2417 	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2418 	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2419 	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2420 	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2421 	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2422 	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2423 	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2424 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2425 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2426 
2427 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2428 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2429 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2430 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2431 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2432 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2433 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2434 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2435 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2436 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2437 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2438 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2439 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2440 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2441 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2442 
2443 	/*
2444 	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2445 	 */
2446 	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2447 	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2448 	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2449 	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2450 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2451 	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2452 	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2453 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2454 	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2455 	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2456 	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2457 	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2458 	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2459 	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2460 
2461 	/*
2462 	 * ... as well as the mbuf checksum flags counterparts.
2463 	 */
2464 	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2465 	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2466 	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2467 	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2468 	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2469 	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2470 	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2471 	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2472 	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2473 	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2474 	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2475 
2476 	/*
2477 	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2478 	 */
2479 	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2480 	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2481 
2482 	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2483 	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2484 	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2485 	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2486 
2487 	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2488 	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2489 	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2490 
2491 	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2492 	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2493 	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2494 	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2495 	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2496 	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2497 	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2498 	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2499 	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2500 	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2501 	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2502 	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2503 	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2504 	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2505 	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2506 	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2507 	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2508 	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2509 
2510 	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2511 	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2512 	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2513 	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2514 	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2515 	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2516 	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2517 	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2518 	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);
2519 
2520 	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2521 	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2522 
2523 	PE_parse_boot_argn("net_affinity", &net_affinity,
2524 	    sizeof(net_affinity));
2525 
2526 	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2527 
2528 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2529 
2530 	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2531 
2532 	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2533 
2534 	VERIFY(dlil_pending_thread_cnt == 0);
2535 #if SKYWALK
2536 	boolean_t pe_enable_fsw_transport_netagent = FALSE;
2537 	boolean_t pe_disable_fsw_transport_netagent = FALSE;
2538 	boolean_t enable_fsw_netagent =
2539 	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2540 	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2541 
2542 	/*
2543 	 * Check the device tree to see if Skywalk netagent has been explicitly
2544 	 * enabled or disabled.  This can be overridden via if_attach_nx below.
2545 	 * Note that the property is a 0-length key, and so checking for the
2546 	 * presence itself is enough (no need to check for the actual value of
2547 	 * the retrieved variable.)
2548 	 */
2549 	pe_enable_fsw_transport_netagent =
2550 	    PE_get_default("kern.skywalk_netagent_enable",
2551 	    &pe_enable_fsw_transport_netagent,
2552 	    sizeof(pe_enable_fsw_transport_netagent));
2553 	pe_disable_fsw_transport_netagent =
2554 	    PE_get_default("kern.skywalk_netagent_disable",
2555 	    &pe_disable_fsw_transport_netagent,
2556 	    sizeof(pe_disable_fsw_transport_netagent));
2557 
2558 	/*
2559 	 * These two are mutually exclusive, i.e. they both can be absent,
2560 	 * but only one can be present at a time, and so we assert to make
2561 	 * sure it is correct.
2562 	 */
2563 	VERIFY((!pe_enable_fsw_transport_netagent &&
2564 	    !pe_disable_fsw_transport_netagent) ||
2565 	    (pe_enable_fsw_transport_netagent ^
2566 	    pe_disable_fsw_transport_netagent));
2567 
2568 	if (pe_enable_fsw_transport_netagent) {
2569 		kprintf("SK: netagent is enabled via an override for "
2570 		    "this platform\n");
2571 		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2572 	} else if (pe_disable_fsw_transport_netagent) {
2573 		kprintf("SK: netagent is disabled via an override for "
2574 		    "this platform\n");
2575 		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2576 	} else {
2577 		kprintf("SK: netagent is %s by default for this platform\n",
2578 		    (enable_fsw_netagent ? "enabled" : "disabled"));
2579 		if_attach_nx = IF_ATTACH_NX_DEFAULT;
2580 	}
2581 
2582 	/*
2583 	 * Now see if there's a boot-arg override.
2584 	 */
2585 	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2586 	    sizeof(if_attach_nx));
2587 	if_enable_fsw_transport_netagent =
2588 	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2589 
2590 	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2591 
2592 	if (pe_disable_fsw_transport_netagent &&
2593 	    if_enable_fsw_transport_netagent) {
2594 		kprintf("SK: netagent is force-enabled\n");
2595 	} else if (!pe_disable_fsw_transport_netagent &&
2596 	    !if_enable_fsw_transport_netagent) {
2597 		kprintf("SK: netagent is force-disabled\n");
2598 	}
2599 #ifdef XNU_TARGET_OS_OSX
2600 	if (if_enable_fsw_transport_netagent) {
2601 		net_filter_event_register(dlil_filter_event);
2602 	}
2603 #endif /* XNU_TARGET_OS_OSX */
2604 
2605 #if (DEVELOPMENT || DEBUG)
2606 	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2607 	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2608 #endif /* (DEVELOPMENT || DEBUG) */
2609 
2610 #endif /* SKYWALK */
2611 	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2612 	    sizeof(struct dlil_ifnet_dbg);
2613 	/* Enforce 64-bit alignment for dlil_ifnet structure */
2614 	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2615 	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2616 	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2617 
2618 	dlif_tcpstat_size = sizeof(struct tcpstat_local);
2619 	/* Enforce 64-bit alignment for tcpstat_local structure */
2620 	dlif_tcpstat_bufsize =
2621 	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2622 	dlif_tcpstat_bufsize = (uint32_t)
2623 	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2624 	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2625 	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2626 
2627 	dlif_udpstat_size = sizeof(struct udpstat_local);
2628 	/* Enforce 64-bit alignment for udpstat_local structure */
2629 	dlif_udpstat_bufsize =
2630 	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2631 	dlif_udpstat_bufsize = (uint32_t)
2632 	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2633 	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2634 	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2635 
2636 	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2637 
2638 	TAILQ_INIT(&dlil_ifnet_head);
2639 	TAILQ_INIT(&ifnet_head);
2640 	TAILQ_INIT(&ifnet_detaching_head);
2641 	TAILQ_INIT(&ifnet_ordered_head);
2642 
2643 	/* Initialize interface address subsystem */
2644 	ifa_init();
2645 
2646 #if PF
2647 	/* Initialize the packet filter */
2648 	pfinit();
2649 #endif /* PF */
2650 
2651 	/* Initialize queue algorithms */
2652 	classq_init();
2653 
2654 	/* Initialize packet schedulers */
2655 	pktsched_init();
2656 
2657 	/* Initialize flow advisory subsystem */
2658 	flowadv_init();
2659 
2660 	/* Initialize the pktap virtual interface */
2661 	pktap_init();
2662 
2663 	/* Initialize the service class to dscp map */
2664 	net_qos_map_init();
2665 
2666 	/* Initialize the interface low power mode event handler */
2667 	if_low_power_evhdlr_init();
2668 
2669 	/* Initialize the interface offload port list subsystem */
2670 	if_ports_used_init();
2671 
2672 #if DEBUG || DEVELOPMENT
2673 	/* Run self-tests */
2674 	dlil_verify_sum16();
2675 #endif /* DEBUG || DEVELOPMENT */
2676 
2677 	/*
2678 	 * Create and start up the main DLIL input thread and the interface
2679 	 * detacher threads once everything is initialized.
2680 	 */
2681 	dlil_incr_pending_thread_count();
2682 	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2683 
2684 	/*
2685 	 * Create ifnet detacher thread.
2686 	 * When an interface gets detached, part of the detach processing
2687 	 * is delayed. The interface is added to delayed detach list
2688 	 * and this thread is woken up to call ifnet_detach_final
2689 	 * on these interfaces.
2690 	 */
2691 	dlil_incr_pending_thread_count();
2692 	if (kernel_thread_start(ifnet_detacher_thread_func,
2693 	    NULL, &thread) != KERN_SUCCESS) {
2694 		panic_plain("%s: couldn't create detacher thread", __func__);
2695 		/* NOTREACHED */
2696 	}
2697 	thread_deallocate(thread);
2698 
2699 	/*
2700 	 * Wait for the created kernel threads for dlil to get
2701 	 * scheduled and run at least once before we proceed
2702 	 */
2703 	lck_mtx_lock(&dlil_thread_sync_lock);
2704 	while (dlil_pending_thread_cnt != 0) {
2705 		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2706 		    "threads to get scheduled at least once.\n", __func__);
2707 		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2708 		    (PZERO - 1), __func__, NULL);
2709 		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2710 	}
2711 	lck_mtx_unlock(&dlil_thread_sync_lock);
2712 	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2713 	    "scheduled at least once. Proceeding.\n", __func__);
2714 }
2715 
2716 static void
if_flt_monitor_busy(struct ifnet * ifp)2717 if_flt_monitor_busy(struct ifnet *ifp)
2718 {
2719 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2720 
2721 	++ifp->if_flt_busy;
2722 	VERIFY(ifp->if_flt_busy != 0);
2723 }
2724 
/*
 * Drop a busy reference on the interface filter list; thin alias of
 * if_flt_monitor_leave(), which also wakes any waiters once the busy
 * count reaches zero.  Caller must hold if_flt_lock (asserted there).
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2730 
2731 static void
if_flt_monitor_enter(struct ifnet * ifp)2732 if_flt_monitor_enter(struct ifnet *ifp)
2733 {
2734 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2735 
2736 	while (ifp->if_flt_busy) {
2737 		++ifp->if_flt_waiters;
2738 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2739 		    (PZERO - 1), "if_flt_monitor", NULL);
2740 	}
2741 	if_flt_monitor_busy(ifp);
2742 }
2743 
2744 static void
if_flt_monitor_leave(struct ifnet * ifp)2745 if_flt_monitor_leave(struct ifnet *ifp)
2746 {
2747 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2748 
2749 	VERIFY(ifp->if_flt_busy != 0);
2750 	--ifp->if_flt_busy;
2751 
2752 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2753 		ifp->if_flt_waiters = 0;
2754 		wakeup(&ifp->if_flt_head);
2755 	}
2756 }
2757 
/*
 * Attach an interface filter to ifp.
 *
 * Validates that ifp is still on the global interface list and attached
 * (taking an IO refcnt via ifnet_is_attached(ifp, 1)), allocates a
 * struct ifnet_filter from dlif_filt_zone, copies the caller's callbacks
 * into it, and inserts it at the tail of ifp->if_flt_head under the
 * filter monitor.  On success *filter_ref is set to the new filter and
 * 0 is returned; on failure an errno (ENXIO) is returned and the
 * allocation, if any, is freed.
 *
 * The caller's if_filter struct is only read here; the iff_name and
 * iff_cookie pointers are retained by reference in the new filter.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* holds an IO refcnt on success; released before 'done' below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface;
	 * filt_input/filt_output/filt_event/filt_ioctl stay NULL (zone
	 * memory is zeroed), so those hooks are never invoked for it.
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* non-internal filters are tracked per-interface as well */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2847 
/*
 * Detach an interface filter.
 *
 * Two modes:
 *  - detached == 0: explicit detach.  Walk the global interface list
 *    looking for 'filter' on some interface's if_flt_head; if found,
 *    mark it filt_skip (so input/output paths bypass it), enter the
 *    filter monitor, unlink it, adjust counters, and fall through to
 *    'destroy'.  Returns EINVAL if the filter isn't found anywhere.
 *  - detached != 0: implicit detach from ifnet_detach_final(); the list
 *    was already emptied by the caller, so only the counters are fixed
 *    up before destruction (no monitor needed; see comment below).
 *
 * In both cases the filt_detached callback (if any) is invoked and the
 * filter is freed back to dlif_filt_zone.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				/* skip entries already being detached */
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only reachable with filter != NULL via the EINVAL path above */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2968 
2969 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2970 dlil_detach_filter(interface_filter_t filter)
2971 {
2972 	if (filter == NULL) {
2973 		return;
2974 	}
2975 	dlil_detach_filter_internal(filter, 0);
2976 }
2977 
2978 __private_extern__ boolean_t
dlil_has_ip_filter(void)2979 dlil_has_ip_filter(void)
2980 {
2981 	boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2982 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2983 	return has_filter;
2984 }
2985 
2986 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2987 dlil_has_if_filter(struct ifnet *ifp)
2988 {
2989 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2990 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2991 	return has_filter;
2992 }
2993 
2994 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2995 dlil_input_wakeup(struct dlil_threading_info *inp)
2996 {
2997 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2998 
2999 	inp->dlth_flags |= DLIL_INPUT_WAITING;
3000 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3001 		inp->dlth_wtot++;
3002 		wakeup_one((caddr_t)&inp->dlth_flags);
3003 	}
3004 }
3005 
/*
 * Entry point for the main DLIL input thread.  Runs exactly once: it
 * marks the thread embryonic, queues a self-wakeup so the continuation
 * runs immediately (clearing the embryonic state and dropping the
 * pending-thread count there), and then blocks with
 * dlil_main_input_thread_cont as the continuation.  All subsequent
 * work happens in the continuation; this function never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait must precede the wakeup below so it isn't missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3028 
/*
 * Main input thread:
 *
 *   a) handles all inbound packets for lo0
 *   b) handles all inbound packets for interfaces with no dedicated
 *	input thread (e.g. anything but Ethernet/PDP or those that support
 *	opportunistic polling.)
 *   c) protocol registrations
 *   d) packet injections
 *
 * Continuation body: snapshots pending work under dlth_lock, drops the
 * lock to process it, then re-checks the flags; loops while any flag
 * other than DLIL_INPUT_RUNNING is set, and otherwise re-arms the wait
 * and blocks on itself as the continuation.  This thread is never
 * terminated or interrupted.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed two ways: inpm adds the lo0 receive queue */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* process the dequeued chains without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* no flags besides RUNNING set: nothing more to do */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3125 
/*
 * Input thread for interfaces with legacy input model.
 *
 * One-shot entry point for a per-interface input thread: names the
 * thread "dlil_input_<ifname>", marks it embryonic, queues a
 * self-wakeup so the continuation runs immediately, and blocks with
 * dlil_input_thread_cont as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* RXPOLL-capable legacy interfaces use the rxpoll thread instead */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait must precede the wakeup below so it isn't missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3163 
/*
 * Continuation body for a per-interface legacy input thread.  Each pass
 * snapshots the pending packet chain under dlth_lock, syncs input stats
 * (unless a Skywalk netif nexus already did), then processes the chain
 * with the lock dropped.  Unlike the main input thread, this one honors
 * DLIL_INPUT_TERMINATE and may be torn down via
 * dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* interrupted wait or a pending terminate request ends the thread */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 *
		 * NOTE: this braceless 'if' intentionally guards the
		 * dlil_input_stats_sync() call below only when SKYWALK
		 * is compiled in; without SKYWALK the call is
		 * unconditional.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* process the dequeued chain without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* idle (or only TERMINATE pending): exit the work loop */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3267 
/*
 * Input thread for interfaces with opportunistic polling input model.
 *
 * One-shot entry point: requires an RXPOLL-capable legacy interface,
 * names the thread "dlil_input_poll_<ifname>", marks it embryonic,
 * queues a self-wakeup so the continuation runs immediately, and blocks
 * with dlil_rxpoll_input_thread_cont as the continuation.  Never
 * returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait must precede the wakeup below so it isn't missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3302 
3303 __attribute__((noreturn))
3304 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3305 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3306 {
3307 	struct dlil_threading_info *inp = v;
3308 	struct ifnet *ifp = inp->dlth_ifp;
3309 	struct timespec ts;
3310 
3311 	lck_mtx_lock_spin(&inp->dlth_lock);
3312 	if (__improbable(wres == THREAD_INTERRUPTED ||
3313 	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3314 		goto terminate;
3315 	}
3316 
3317 	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3318 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
3319 
3320 	while (1) {
3321 		struct mbuf *m = NULL;
3322 		uint32_t m_cnt, poll_req = 0;
3323 		uint64_t m_size = 0;
3324 		ifnet_model_t mode;
3325 		struct timespec now, delta;
3326 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3327 		boolean_t notify;
3328 		boolean_t embryonic;
3329 		uint64_t ival;
3330 
3331 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3332 
3333 		if (__improbable(embryonic =
3334 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3335 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3336 			goto skip;
3337 		}
3338 
3339 		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3340 			ival = IF_RXPOLL_INTERVALTIME_MIN;
3341 		}
3342 
3343 		/* Link parameters changed? */
3344 		if (ifp->if_poll_update != 0) {
3345 			ifp->if_poll_update = 0;
3346 			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3347 		}
3348 
3349 		/* Current operating mode */
3350 		mode = ifp->if_poll_mode;
3351 
3352 		/*
3353 		 * Protocol registration and injection must always use
3354 		 * the main input thread; in theory the latter can utilize
3355 		 * the corresponding input thread where the packet arrived
3356 		 * on, but that requires our knowing the interface in advance
3357 		 * (and the benefits might not worth the trouble.)
3358 		 */
3359 		VERIFY(!(inp->dlth_flags &
3360 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3361 
3362 		/* Total count of all packets */
3363 		m_cnt = qlen(&inp->dlth_pkts);
3364 
3365 		/* Total bytes of all packets */
3366 		m_size = qsize(&inp->dlth_pkts);
3367 
3368 		/* Packets for this interface */
3369 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3370 		m = pkt.cp_mbuf;
3371 		VERIFY(m != NULL || m_cnt == 0);
3372 
3373 		nanouptime(&now);
3374 		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3375 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3376 		}
3377 
3378 		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3379 		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3380 			u_int32_t ptot, btot;
3381 
3382 			/* Accumulate statistics for current sampling */
3383 			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3384 
3385 			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3386 				goto skip;
3387 			}
3388 
3389 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3390 
3391 			/* Calculate min/max of inbound bytes */
3392 			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3393 			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3394 				ifp->if_rxpoll_bmin = btot;
3395 			}
3396 			if (btot > ifp->if_rxpoll_bmax) {
3397 				ifp->if_rxpoll_bmax = btot;
3398 			}
3399 
3400 			/* Calculate EWMA of inbound bytes */
3401 			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3402 
3403 			/* Calculate min/max of inbound packets */
3404 			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3405 			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3406 				ifp->if_rxpoll_pmin = ptot;
3407 			}
3408 			if (ptot > ifp->if_rxpoll_pmax) {
3409 				ifp->if_rxpoll_pmax = ptot;
3410 			}
3411 
3412 			/* Calculate EWMA of inbound packets */
3413 			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3414 
3415 			/* Reset sampling statistics */
3416 			PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3417 
3418 			/* Calculate EWMA of wakeup requests */
3419 			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3420 			    if_rxpoll_decay);
3421 			inp->dlth_wtot = 0;
3422 
3423 			if (dlil_verbose) {
3424 				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3425 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3426 				}
3427 				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3428 				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3429 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3430 					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3431 					    "limits [%d/%d], wreq avg %d "
3432 					    "limits [%d/%d], bytes avg %d "
3433 					    "limits [%d/%d]\n", if_name(ifp),
3434 					    (ifp->if_poll_mode ==
3435 					    IFNET_MODEL_INPUT_POLL_ON) ?
3436 					    "ON" : "OFF", ifp->if_rxpoll_pavg,
3437 					    ifp->if_rxpoll_pmax,
3438 					    ifp->if_rxpoll_plowat,
3439 					    ifp->if_rxpoll_phiwat,
3440 					    ifp->if_rxpoll_wavg,
3441 					    ifp->if_rxpoll_wlowat,
3442 					    ifp->if_rxpoll_whiwat,
3443 					    ifp->if_rxpoll_bavg,
3444 					    ifp->if_rxpoll_blowat,
3445 					    ifp->if_rxpoll_bhiwat);
3446 				}
3447 			}
3448 
3449 			/* Perform mode transition, if necessary */
3450 			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3451 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3452 			}
3453 
3454 			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3455 			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3456 				goto skip;
3457 			}
3458 
3459 			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3460 			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3461 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3462 				mode = IFNET_MODEL_INPUT_POLL_OFF;
3463 			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3464 			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3465 			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3466 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3467 				mode = IFNET_MODEL_INPUT_POLL_ON;
3468 			}
3469 
3470 			if (mode != ifp->if_poll_mode) {
3471 				ifp->if_poll_mode = mode;
3472 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3473 				poll_req++;
3474 			}
3475 		}
3476 skip:
3477 		notify = dlil_input_stats_sync(ifp, inp);
3478 
3479 		lck_mtx_unlock(&inp->dlth_lock);
3480 
3481 		if (__improbable(embryonic)) {
3482 			ifnet_decr_pending_thread_count(ifp);
3483 		}
3484 
3485 		if (__improbable(notify)) {
3486 			ifnet_notify_data_threshold(ifp);
3487 		}
3488 
3489 		/*
3490 		 * If there's a mode change and interface is still attached,
3491 		 * perform a downcall to the driver for the new mode.  Also
3492 		 * hold an IO refcnt on the interface to prevent it from
3493 		 * being detached (will be release below.)
3494 		 */
3495 		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3496 			struct ifnet_model_params p = {
3497 				.model = mode, .reserved = { 0 }
3498 			};
3499 			errno_t err;
3500 
3501 			if (dlil_verbose) {
3502 				DLIL_PRINTF("%s: polling is now %s, "
3503 				    "pkts avg %d max %d limits [%d/%d], "
3504 				    "wreq avg %d limits [%d/%d], "
3505 				    "bytes avg %d limits [%d/%d]\n",
3506 				    if_name(ifp),
3507 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3508 				    "ON" : "OFF", ifp->if_rxpoll_pavg,
3509 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3510 				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3511 				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3512 				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3513 				    ifp->if_rxpoll_bhiwat);
3514 			}
3515 
3516 			if ((err = ((*ifp->if_input_ctl)(ifp,
3517 			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3518 				DLIL_PRINTF("%s: error setting polling mode "
3519 				    "to %s (%d)\n", if_name(ifp),
3520 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3521 				    "ON" : "OFF", err);
3522 			}
3523 
3524 			switch (mode) {
3525 			case IFNET_MODEL_INPUT_POLL_OFF:
3526 				ifnet_set_poll_cycle(ifp, NULL);
3527 				ifp->if_rxpoll_offreq++;
3528 				if (err != 0) {
3529 					ifp->if_rxpoll_offerr++;
3530 				}
3531 				break;
3532 
3533 			case IFNET_MODEL_INPUT_POLL_ON:
3534 				net_nsectimer(&ival, &ts);
3535 				ifnet_set_poll_cycle(ifp, &ts);
3536 				ifnet_poll(ifp);
3537 				ifp->if_rxpoll_onreq++;
3538 				if (err != 0) {
3539 					ifp->if_rxpoll_onerr++;
3540 				}
3541 				break;
3542 
3543 			default:
3544 				VERIFY(0);
3545 				/* NOTREACHED */
3546 			}
3547 
3548 			/* Release the IO refcnt */
3549 			ifnet_decr_iorefcnt(ifp);
3550 		}
3551 
3552 		/*
3553 		 * NOTE warning %%% attention !!!!
3554 		 * We should think about putting some thread starvation
3555 		 * safeguards if we deal with long chains of packets.
3556 		 */
3557 		if (__probable(m != NULL)) {
3558 			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3559 		}
3560 
3561 		lck_mtx_lock_spin(&inp->dlth_lock);
3562 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3563 		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3564 		    DLIL_INPUT_TERMINATE))) {
3565 			break;
3566 		}
3567 	}
3568 
3569 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3570 
3571 	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3572 terminate:
3573 		lck_mtx_unlock(&inp->dlth_lock);
3574 		dlil_terminate_input_thread(inp);
3575 		/* NOTREACHED */
3576 	} else {
3577 		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3578 		lck_mtx_unlock(&inp->dlth_lock);
3579 		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3580 		    inp);
3581 		/* NOTREACHED */
3582 	}
3583 
3584 	VERIFY(0);      /* we should never get here */
3585 	/* NOTREACHED */
3586 	__builtin_unreachable();
3587 }
3588 
3589 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3590 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3591 {
3592 	if (p != NULL) {
3593 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3594 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3595 			return EINVAL;
3596 		}
3597 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3598 		    p->packets_lowat >= p->packets_hiwat) {
3599 			return EINVAL;
3600 		}
3601 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3602 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3603 			return EINVAL;
3604 		}
3605 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3606 		    p->bytes_lowat >= p->bytes_hiwat) {
3607 			return EINVAL;
3608 		}
3609 		if (p->interval_time != 0 &&
3610 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3611 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3612 		}
3613 	}
3614 	return 0;
3615 }
3616 
/*
 * Recompute an interface's RX polling watermarks and interval.
 *
 * Called with the input thread lock held (see dlil_rxpoll_set_params).
 * If the link rate is unknown (0) and no explicit parameters were
 * given, polling is effectively disabled: low watermarks go to 0 and
 * high watermarks to UINT32_MAX so the poll-ON thresholds can never be
 * reached.  Otherwise, any parameter the caller left at zero (or a
 * NULL p) is auto-tuned from the link-speed table rxpoll_tbl.
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		/* unreachable hiwat => mode can never flip to POLL_ON */
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry whose speed <= link rate */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* a non-zero if_rxpoll_max sysctl overrides the caller */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		/* likewise, a non-default interval sysctl wins */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert ns tunables into the timespec form used by the poller */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3686 
/*
 * Must be called on an attached ifnet (caller is expected to check.)
 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
 *
 * Validates the supplied parameters and applies them to the interface
 * under the input thread lock.  Returns ENXIO if the interface is not
 * RX-poll capable, or EINVAL for inconsistent parameters.
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3728 
3729 /*
3730  * Must be called on an attached ifnet (caller is expected to check.)
3731  */
3732 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3733 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3734 {
3735 	struct dlil_threading_info *inp;
3736 
3737 	VERIFY(ifp != NULL && p != NULL);
3738 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3739 		return ENXIO;
3740 	}
3741 
3742 	bzero(p, sizeof(*p));
3743 
3744 	lck_mtx_lock(&inp->dlth_lock);
3745 	p->packets_limit = ifp->if_rxpoll_plim;
3746 	p->packets_lowat = ifp->if_rxpoll_plowat;
3747 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3748 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3749 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3750 	p->interval_time = ifp->if_rxpoll_ival;
3751 	lck_mtx_unlock(&inp->dlth_lock);
3752 
3753 	return 0;
3754 }
3755 
/*
 * Enqueue a driver-supplied chain of inbound packets.  Legacy variant:
 * no chain tail and no mandatory statistics; ifnet_input_common() will
 * walk the chain to compute packet/byte counts.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3762 
/*
 * Extended variant of ifnet_input(): the driver supplies both the
 * chain tail and the statistics up front, which lets
 * ifnet_input_common() skip walking the chain.
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3769 
/*
 * Input variant used when the driver operates in RX-poll mode.  A NULL
 * m_head is permitted here (ext is only set when packets are present);
 * ifnet_input_common() accepts an empty call when poll is TRUE.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	           (m_head != NULL), TRUE);
}
3777 
/*
 * Common implementation behind the ifnet_input*() variants: validate
 * the chain, take an I/O reference on the interface, compute (or
 * trust) the packet/byte counts, and hand everything to the
 * interface's DLIL input function.
 *
 * @param ifp    interface the packets arrived on.
 * @param m_head head of the mbuf packet chain (may be NULL iff poll).
 * @param m_tail tail of the chain; NULL means "walk the chain".
 * @param s      stat increments from the driver (required iff ext).
 * @param ext    extended variant: driver supplied tail and stats.
 * @param poll   chain was collected in RX-poll mode.
 *
 * Returns 0 on success, EINVAL for bad arguments or a detached
 * interface (the chain is freed in those cases).
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* empty chains only allowed for poll; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* recount the chain to cross-check driver stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * NOTE(review): when the caller supplied s, the call below passes
	 * the caller's s, not _s — so these two assignments only take
	 * effect in the s == NULL case (where s aliases _s).  Looks
	 * intentional (driver stats are trusted), but worth confirming.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3892 
3893 #if SKYWALK
/*
 * Atomically install fn as the interface's DLIL input handler, but
 * only if the default dlil_input_handler is still in place; returns
 * EBUSY if some other handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3901 
/*
 * Restore the default dlil_input_handler, retrying the compare-and-
 * swap until it succeeds (the installed handler may change between the
 * read of if_input_dlil and the swap).
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
3911 
/*
 * Atomically install fn as the interface's DLIL output handler, but
 * only if the default dlil_output_handler is still in place; returns
 * EBUSY if some other handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3919 
/*
 * Restore the default dlil_output_handler, retrying the compare-and-
 * swap until it succeeds (mirrors dlil_reset_input_handler).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3929 #endif /* SKYWALK */
3930 
/*
 * Default DLIL output handler: hand the packet straight to the
 * driver's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
3936 
/*
 * Default DLIL input entry point for inbound packet chains.
 *
 * Dispatches to the input thread's strategy (async enqueue or sync
 * processing); interfaces without a dedicated input thread fall back
 * to the main input thread.  On DEVELOPMENT/DEBUG kernels, a thread
 * marked NET_THREAD_SYNC_RX always processes the chain synchronously.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3957 
/*
 * Asynchronous (default) input strategy: enqueue the chain on the
 * input thread's packet queue, add the stat increments, and wake the
 * input thread; the packets are processed later in that thread's
 * context.  Also performs one-time affinity binding of the driver /
 * poller thread to the input thread's affinity set.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the spin lock across the affinity downcall */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: verify the driver-supplied counts against the chain */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4070 
/*
 * Synchronous input strategy: enqueue the chain, then immediately
 * drain the input queue and process the packets in the caller's
 * context instead of waking the input thread.  Used on DEVELOPMENT/
 * DEBUG kernels for threads marked NET_THREAD_SYNC_RX (see
 * dlil_input_handler).
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: verify the driver-supplied counts against the chain */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued so far, not just this call's chain */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	/* NB: braceless if — it guards only the next statement */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4155 
4156 #if SKYWALK
/*
 * Atomically replace if_output with fn, expecting the saved driver
 * output routine (if_save_output) to still be installed; returns
 * EBUSY otherwise.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4164 
/*
 * Restore the saved driver output routine (if_save_output) as
 * if_output, retrying the compare-and-swap until it succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4174 
/*
 * Atomically replace if_start with fn, expecting the saved driver
 * start routine (if_save_start) to still be installed; returns EBUSY
 * otherwise.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return atomic_test_set_ptr(&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4182 
/*
 * Restore the saved driver start routine (if_save_start) as if_start,
 * retrying the compare-and-swap until it succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4192 #endif /* SKYWALK */
4193 
/*
 * Common body for ifnet_start() and the flow-control resume path:
 * record a transmit request and wake the starter thread if it is idle.
 * With resetfc set, IFSF_FLOW_CONTROLLED is cleared first; otherwise a
 * flow-controlled interface is left alone.  No-op for interfaces
 * without a starter thread (IFEF_TXSTART not set).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Defer the wakeup while ENQUEUE_MULTI delay-start is active and
	 * the send queue has not yet accumulated if_start_delay_qlen
	 * packets (unless this is a flow-control resume).
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4223 
/*
 * Exported wrapper: kick the starter thread without touching the
 * flow-control state.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE);
}
4229 
/*
 * Bootstrap entry point for an interface's starter thread: names the
 * thread, optionally binds lo0's starter to the main input thread's
 * affinity set, then enters the embryonic wait and continues in
 * ifnet_start_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	/* enter embryonic state; the continuation clears it on first run */
	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4295 
/*
 * Continuation body of the starter thread: services transmit requests
 * by repeatedly invoking the driver's if_start routine, then blocks
 * again on if_start_thread (possibly with a deadline when TBR rate
 * limiting or delay-start is in effect).  Terminates itself when the
 * thread was interrupted or IFSF_TERMINATING is set.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first run after ifnet_start_thread_func: leave embryonic state */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delay-start: with ENQUEUE_MULTI, hold off calling the
		 * driver until enough packets have queued up.
		 */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* delay-start pending: wake up after the delay timeout */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4443 
4444 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4445 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4446 {
4447 	if (ts == NULL) {
4448 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4449 	} else {
4450 		*(&ifp->if_start_cycle) = *ts;
4451 	}
4452 
4453 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4454 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4455 		    if_name(ifp), ts->tv_nsec);
4456 	}
4457 }
4458 
4459 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4460 ifnet_poll_wakeup(struct ifnet *ifp)
4461 {
4462 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4463 
4464 	ifp->if_poll_req++;
4465 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4466 	    ifp->if_poll_thread != THREAD_NULL) {
4467 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4468 	}
4469 }
4470 
/*
 * Request a receive-poll pass on the interface: bump the pending poll
 * request counter and kick the poller thread if it is idle.  Takes
 * if_poll_lock internally.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4481 
/*
 * Bootstrap entry for the per-interface RX poller thread.  Names the
 * thread, marks it embryonic, and immediately blocks into
 * ifnet_poll_thread_cont(), which does all subsequent work.  The
 * assert_wait() is armed *before* the wakeup so the initial
 * ifnet_poll_wakeup() cannot be lost.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	/* only interfaces opted into RX polling get a poller thread */
	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4510 
/*
 * Continuation routine for the RX poller thread.  Each pass services
 * outstanding poll requests by invoking the driver's if_input_poll
 * routine and feeding the harvested packets into the input path, then
 * re-arms itself via thread_block_parameter() — sleeping either
 * indefinitely or for if_poll_cycle — until the interface terminates.
 * if_poll_lock is held on entry to, and exit from, the service loop.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	/* NOTE(review): start_time is cleared but not otherwise used here */
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* interrupted wait or pending termination: shut the thread down */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after creation: leave the embryonic state and let
	 * the attach path (waiting on the pending-thread count) proceed.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot the request counter to detect new requests below */
		u_int16_t req = ifp->if_poll_req;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* no packets this pass; still poke the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		/* re-enter this routine on next wakeup */
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4677 
4678 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4679 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4680 {
4681 	if (ts == NULL) {
4682 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4683 	} else {
4684 		*(&ifp->if_poll_cycle) = *ts;
4685 	}
4686 
4687 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4688 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4689 		    if_name(ifp), ts->tv_nsec);
4690 	}
4691 }
4692 
4693 void
ifnet_purge(struct ifnet * ifp)4694 ifnet_purge(struct ifnet *ifp)
4695 {
4696 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4697 		if_qflush_snd(ifp, false);
4698 	}
4699 }
4700 
4701 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4702 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4703 {
4704 	IFCQ_LOCK_ASSERT_HELD(ifq);
4705 
4706 	if (!(IFCQ_IS_READY(ifq))) {
4707 		return;
4708 	}
4709 
4710 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4711 		struct tb_profile tb = {
4712 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4713 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4714 		};
4715 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4716 	}
4717 
4718 	ifclassq_update(ifq, ev);
4719 }
4720 
4721 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4722 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4723 {
4724 	switch (ev) {
4725 	case CLASSQ_EV_LINK_BANDWIDTH:
4726 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4727 			ifp->if_poll_update++;
4728 		}
4729 		break;
4730 
4731 	default:
4732 		break;
4733 	}
4734 }
4735 
4736 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4737 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4738 {
4739 	struct ifclassq *ifq;
4740 	u_int32_t omodel;
4741 	errno_t err;
4742 
4743 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4744 		return EINVAL;
4745 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4746 		return ENXIO;
4747 	}
4748 
4749 	ifq = ifp->if_snd;
4750 	IFCQ_LOCK(ifq);
4751 	omodel = ifp->if_output_sched_model;
4752 	ifp->if_output_sched_model = model;
4753 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4754 		ifp->if_output_sched_model = omodel;
4755 	}
4756 	IFCQ_UNLOCK(ifq);
4757 
4758 	return err;
4759 }
4760 
4761 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4762 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4763 {
4764 	if (ifp == NULL) {
4765 		return EINVAL;
4766 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4767 		return ENXIO;
4768 	}
4769 
4770 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4771 
4772 	return 0;
4773 }
4774 
4775 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4776 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4777 {
4778 	if (ifp == NULL || maxqlen == NULL) {
4779 		return EINVAL;
4780 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4781 		return ENXIO;
4782 	}
4783 
4784 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4785 
4786 	return 0;
4787 }
4788 
4789 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4790 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4791 {
4792 	errno_t err;
4793 
4794 	if (ifp == NULL || pkts == NULL) {
4795 		err = EINVAL;
4796 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4797 		err = ENXIO;
4798 	} else {
4799 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4800 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4801 	}
4802 
4803 	return err;
4804 }
4805 
4806 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4807 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4808     u_int32_t *pkts, u_int32_t *bytes)
4809 {
4810 	errno_t err;
4811 
4812 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4813 	    (pkts == NULL && bytes == NULL)) {
4814 		err = EINVAL;
4815 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4816 		err = ENXIO;
4817 	} else {
4818 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4819 		    pkts, bytes);
4820 	}
4821 
4822 	return err;
4823 }
4824 
4825 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4826 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4827 {
4828 	struct dlil_threading_info *inp;
4829 
4830 	if (ifp == NULL) {
4831 		return EINVAL;
4832 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4833 		return ENXIO;
4834 	}
4835 
4836 	if (maxqlen == 0) {
4837 		maxqlen = if_rcvq_maxlen;
4838 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4839 		maxqlen = IF_RCVQ_MINLEN;
4840 	}
4841 
4842 	inp = ifp->if_inp;
4843 	lck_mtx_lock(&inp->dlth_lock);
4844 	qlimit(&inp->dlth_pkts) = maxqlen;
4845 	lck_mtx_unlock(&inp->dlth_lock);
4846 
4847 	return 0;
4848 }
4849 
4850 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4851 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4852 {
4853 	struct dlil_threading_info *inp;
4854 
4855 	if (ifp == NULL || maxqlen == NULL) {
4856 		return EINVAL;
4857 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4858 		return ENXIO;
4859 	}
4860 
4861 	inp = ifp->if_inp;
4862 	lck_mtx_lock(&inp->dlth_lock);
4863 	*maxqlen = qlimit(&inp->dlth_pkts);
4864 	lck_mtx_unlock(&inp->dlth_lock);
4865 	return 0;
4866 }
4867 
4868 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4869 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4870     uint16_t delay_timeout)
4871 {
4872 	if (delay_qlen > 0 && delay_timeout > 0) {
4873 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4874 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4875 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4876 		/* convert timeout to nanoseconds */
4877 		ifp->if_start_delay_timeout *= 1000;
4878 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4879 		    ifp->if_xname, (uint32_t)delay_qlen,
4880 		    (uint32_t)delay_timeout);
4881 	} else {
4882 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4883 	}
4884 }
4885 
4886 /*
4887  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4888  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4889  * buf holds the full header.
4890  */
4891 static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t * buf,uint8_t ip_ver)4892 ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
4893 {
4894 	struct ip *ip;
4895 	struct ip6_hdr *ip6;
4896 	uint8_t lbuf[64] __attribute__((aligned(8)));
4897 	uint8_t *p = buf;
4898 
4899 	if (ip_ver == IPVERSION) {
4900 		uint8_t old_tos;
4901 		uint32_t sum;
4902 
4903 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4904 			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
4905 			bcopy(buf, lbuf, sizeof(struct ip));
4906 			p = lbuf;
4907 		}
4908 		ip = (struct ip *)(void *)p;
4909 		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
4910 			return;
4911 		}
4912 
4913 		DTRACE_IP1(clear__v4, struct ip *, ip);
4914 		old_tos = ip->ip_tos;
4915 		ip->ip_tos &= IPTOS_ECN_MASK;
4916 		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
4917 		sum = (sum >> 16) + (sum & 0xffff);
4918 		ip->ip_sum = (uint16_t)(sum & 0xffff);
4919 
4920 		if (__improbable(p == lbuf)) {
4921 			bcopy(lbuf, buf, sizeof(struct ip));
4922 		}
4923 	} else {
4924 		uint32_t flow;
4925 		ASSERT(ip_ver == IPV6_VERSION);
4926 
4927 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4928 			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
4929 			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
4930 			p = lbuf;
4931 		}
4932 		ip6 = (struct ip6_hdr *)(void *)p;
4933 		flow = ntohl(ip6->ip6_flow);
4934 		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
4935 			return;
4936 		}
4937 
4938 		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
4939 		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
4940 
4941 		if (__improbable(p == lbuf)) {
4942 			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
4943 		}
4944 	}
4945 }
4946 
/*
 * Enqueue a single packet (mbuf or native Skywalk packet) onto the
 * interface output scheduler (ifcq if non-NULL, else ifp->if_snd).
 * Along the way it: stamps the packet with an uptime timestamp if it
 * doesn't carry one, updates foreground/realtime activity timestamps,
 * applies the Wi-Fi-infra multicast DSCP-clearing workaround, and runs
 * the ENQUEUE_MULTI delayed-start heuristics.  The packet is consumed
 * in all cases except the ENOMEM pullup failures; *pdrop reports
 * whether the classq dropped it.  If `flush' is set (or on EQFULL /
 * EQSUSPENDED) the driver's start routine is kicked, unless delayed
 * start (IFEF_ENQUEUE_MULTI) is in effect.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	/* ip_ver is only consumed when mcast_buf != NULL (set together) */
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					/* mbuf was freed by m_pullup */
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP ethertype: skip the DSCP workaround */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					/* mbuf was freed by m_pullup */
					return ENOMEM;
				}

				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				/* too short for an Ethernet header: skip workaround */
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP ethertype: skip the DSCP workaround */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current window: count packet */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on packet count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5257 
5258 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5259 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5260     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5261     boolean_t flush, boolean_t *pdrop)
5262 {
5263 	int error;
5264 
5265 	/* enqueue the packet (caller consumes object) */
5266 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5267 	    cnt, bytes, pdrop);
5268 
5269 	/*
5270 	 * Tell the driver to start dequeueing; do this even when the queue
5271 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5272 	 * be dequeueing from other unsuspended queues.
5273 	 */
5274 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5275 		ifnet_start(ifp);
5276 	}
5277 	return error;
5278 }
5279 
5280 #if DEVELOPMENT || DEBUG
5281 void
trace_pkt_dump_payload(struct ifnet * ifp,struct __kern_packet * kpkt,bool input)5282 trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
5283 {
5284 #define MIN_TRACE_DUMP_PKT_SIZE  32
5285 	struct ether_header *eh = NULL;
5286 	struct udphdr *uh = NULL;
5287 
5288 	if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
5289 	    flow_key_trace.fk_ipver != IPV6_VERSION))) {
5290 		return;
5291 	}
5292 
5293 	uint16_t bdlim, bdlen, bdoff;
5294 	uint8_t *baddr;
5295 
5296 	MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);
5297 
5298 	if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
5299 		if (!IFNET_IS_ETHERNET(ifp)) {
5300 			return;
5301 		}
5302 
5303 		sa_family_t af = AF_UNSPEC;
5304 		ASSERT(kpkt->pkt_l2_len > 0);
5305 
5306 		baddr += kpkt->pkt_headroom;
5307 		eh = (struct ether_header *)(void *)baddr;
5308 		if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
5309 			return;
5310 		}
5311 		if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
5312 			return;
5313 		}
5314 		uint16_t ether_type = ntohs(eh->ether_type);
5315 		if (ether_type == ETHERTYPE_IP) {
5316 			af = AF_INET;
5317 		} else if (ether_type == ETHERTYPE_IPV6) {
5318 			af = AF_INET6;
5319 		} else {
5320 			return;
5321 		}
5322 		flow_pkt_classify(kpkt, ifp, af, input);
5323 	}
5324 
5325 	if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
5326 		return;
5327 	}
5328 
5329 	if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
5330 		return;
5331 	}
5332 
5333 	uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
5334 	uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;
5335 
5336 	if (kpkt->pkt_flow_udp_src != sport ||
5337 	    kpkt->pkt_flow_udp_dst != dport) {
5338 		return;
5339 	}
5340 
5341 	if (kpkt->pkt_flow_ip_ver == IPVERSION) {
5342 		struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
5343 		struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
5344 		struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;
5345 
5346 		if (ip_header->ip_src.s_addr != saddr->s_addr ||
5347 		    ip_header->ip_dst.s_addr != daddr->s_addr) {
5348 			return;
5349 		}
5350 	} else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
5351 		struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
5352 		struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
5353 		struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;
5354 
5355 		if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
5356 		    !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
5357 			return;
5358 		}
5359 	}
5360 
5361 	int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);
5362 
5363 	uint16_t pkt_payload_len = bdlim - bdoff;
5364 	pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);
5365 	pkt_payload_len -= udp_payload_offset;
5366 
5367 	if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
5368 		uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
5369 		uint8_t *payload = (uint8_t *)(uh + 1);
5370 
5371 		/* Trace 32 bytes of UDP transport payload */
5372 		uint64_t *trace1 = __DECONST(uint64_t *, payload);
5373 		uint64_t *trace2 = trace1 + 1;
5374 		uint64_t *trace3 = trace2 + 1;
5375 		uint64_t *trace4 = trace3 + 1;
5376 
5377 		if (input) {
5378 			KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5379 		} else {
5380 			KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5381 		}
5382 	}
5383 }
5384 #endif /* DEVELOPMENT || DEBUG */
5385 
5386 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5387 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5388 {
5389 	struct ifnet *ifp = handle;
5390 	boolean_t pdrop;        /* dummy */
5391 	uint32_t i;
5392 
5393 	ASSERT(n_pkts >= 1);
5394 	for (i = 0; i < n_pkts - 1; i++) {
5395 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5396 		    FALSE, &pdrop);
5397 	}
5398 	/* flush with the last packet */
5399 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5400 	    TRUE, &pdrop);
5401 
5402 	return 0;
5403 }
5404 
5405 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5406 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5407     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5408 {
5409 #if DEVELOPMENT || DEBUG
5410 	switch (pkt->cp_ptype) {
5411 	case QP_PACKET: {
5412 		trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
5413 		break;
5414 	}
5415 	case QP_MBUF:
5416 	case QP_INVALID: {
5417 		break;
5418 	}
5419 	}
5420 #endif /* DEVELOPMENT || DEBUG */
5421 
5422 	if (ifp->if_output_netem != NULL) {
5423 		bool drop;
5424 		errno_t error;
5425 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5426 		*pdrop = drop ? TRUE : FALSE;
5427 		return error;
5428 	} else {
5429 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5430 	}
5431 }
5432 
5433 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5434 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5435 {
5436 	boolean_t pdrop;
5437 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5438 }
5439 
/*
 * Validate and enqueue a single mbuf packet on ifp's output classq.
 * Rejects chained mbufs (m_nextpkt set) and mbufs without a packet
 * header.  On failure the packet is freed here and *pdrop is set TRUE.
 *
 * NOTE(review): when m is NULL, EINVAL is returned with *pdrop left
 * untouched -- callers reading *pdrop on that path would see an
 * uninitialized value; confirm all callers ignore it on EINVAL.
 */
errno_t
ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
    boolean_t *pdrop)
{
	classq_pkt_t pkt;

	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
	    m->m_nextpkt != NULL) {
		if (m != NULL) {
			m_freem_list(m);
			*pdrop = TRUE;
		}
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    !IF_FULLY_ATTACHED(ifp)) {
		/* flag tested without lock for performance */
		m_freem(m);
		*pdrop = TRUE;
		return ENXIO;
	} else if (!(ifp->if_flags & IFF_UP)) {
		/* interface is administratively down */
		m_freem(m);
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_MBUF(&pkt, m);
	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
}
5468 
/*
 * Enqueue a pre-counted chain of mbuf packets (m_head..m_tail, cnt
 * packets, bytes total) on ifp's output classq.  On failure the entire
 * chain is freed here and *pdrop is set TRUE; on success ownership
 * passes to the classq.  Callers must guarantee non-NULL head/tail
 * with packet headers and a TXSTART-capable interface (ASSERTed).
 */
errno_t
ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	classq_pkt_t head, tail;

	ASSERT(m_head != NULL);
	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
	ASSERT(m_tail != NULL);
	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	if (!IF_FULLY_ATTACHED(ifp)) {
		/* flag tested without lock for performance */
		m_freem_list(m_head);
		*pdrop = TRUE;
		return ENXIO;
	} else if (!(ifp->if_flags & IFF_UP)) {
		/* interface is administratively down */
		m_freem_list(m_head);
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
	           flush, pdrop);
}
5499 
5500 #if SKYWALK
5501 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5502 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5503     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5504 {
5505 	classq_pkt_t pkt;
5506 
5507 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5508 
5509 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5510 		if (kpkt != NULL) {
5511 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5512 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5513 			*pdrop = TRUE;
5514 		}
5515 		return EINVAL;
5516 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5517 	    !IF_FULLY_ATTACHED(ifp))) {
5518 		/* flag tested without lock for performance */
5519 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5520 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5521 		*pdrop = TRUE;
5522 		return ENXIO;
5523 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5524 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5525 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5526 		*pdrop = TRUE;
5527 		return ENETDOWN;
5528 	}
5529 
5530 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5531 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5532 }
5533 
/*
 * Enqueue a single skywalk packet on ifp's default output classq
 * (no explicit ifclassq).
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5540 
/*
 * Enqueue a single skywalk packet on a caller-specified ifclassq
 * belonging to ifp.
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5547 
5548 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5549 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5550     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5551     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5552 {
5553 	classq_pkt_t head, tail;
5554 
5555 	ASSERT(k_head != NULL);
5556 	ASSERT(k_tail != NULL);
5557 	ASSERT(ifp != NULL);
5558 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5559 
5560 	if (!IF_FULLY_ATTACHED(ifp)) {
5561 		/* flag tested without lock for performance */
5562 		pp_free_packet_chain(k_head, NULL);
5563 		*pdrop = TRUE;
5564 		return ENXIO;
5565 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5566 		pp_free_packet_chain(k_head, NULL);
5567 		*pdrop = TRUE;
5568 		return ENETDOWN;
5569 	}
5570 
5571 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5572 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5573 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5574 	           flush, pdrop);
5575 }
5576 
/*
 * Enqueue a chain of skywalk packets on ifp's default output classq
 * (no explicit ifclassq).
 */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5585 
/*
 * Enqueue a chain of skywalk packets on a caller-specified ifclassq
 * belonging to ifp.
 */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5594 #endif /* SKYWALK */
5595 
/*
 * Dequeue one mbuf packet from ifp's output classq.  Returns the
 * packet (possibly NULL) in *mp along with ifclassq_dequeue()'s
 * return code.  EINVAL on bad arguments; ENXIO when the interface is
 * not TXSTART-capable, uses an unknown scheduling model, or is not
 * attached.
 */
errno_t
ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
{
	errno_t rc;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	if (ifp == NULL || mp == NULL) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI must not be used on native skywalk ifnets */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
	    &pkt, NULL, NULL, NULL, 0);
	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*mp = pkt.cp_mbuf;
	return rc;
}
5622 
/*
 * Dequeue one mbuf packet of service class sc from ifp's output
 * classq.  Same validation and refcount discipline as ifnet_dequeue(),
 * plus a service-class validity check.
 */
errno_t
ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
    struct mbuf **mp)
{
	errno_t rc;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI must not be used on native skywalk ifnets */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*mp = pkt.cp_mbuf;
	return rc;
}
5650 
/*
 * Dequeue up to pkt_limit mbuf packets from ifp's output classq.
 * The chain head is returned in *head; tail, count and total bytes
 * are returned through the optional tail/cnt/len pointers.
 */
errno_t
ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI must not be used on native skywalk ifnets */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5682 
/*
 * Dequeue mbuf packets from ifp's output classq bounded by byte_limit
 * total bytes (packet count bounded only by the classq maximum).
 * Results are returned as in ifnet_dequeue_multi().
 */
errno_t
ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || byte_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI must not be used on native skywalk ifnets */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5714 
/*
 * Dequeue up to pkt_limit mbuf packets of service class sc from ifp's
 * output classq.  Results are returned as in ifnet_dequeue_multi().
 */
errno_t
ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
    u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
	    !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an I/O refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* this legacy dequeue KPI must not be used on native skywalk ifnets */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
	    cnt, len, 0);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5749 
5750 #if XNU_TARGET_OS_OSX
/*
 * Adapter for legacy framer callbacks that do not report encapsulation
 * sizes: report zero prepended (*pre) and appended (*post) bytes, then
 * forward to the interface's legacy framer.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
5765 #endif /* XNU_TARGET_OS_OSX */
5766 
5767 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5768 packet_has_vlan_tag(struct mbuf * m)
5769 {
5770 	u_int   tag = 0;
5771 
5772 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5773 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5774 		if (tag == 0) {
5775 			/* the packet is just priority-tagged, clear the bit */
5776 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5777 		}
5778 	}
5779 	return tag != 0;
5780 }
5781 
/*
 * Pass an inbound packet (*m_p) through ifp's interface filter chain.
 * Returns 0 when the packet should continue up the stack; a filter's
 * non-zero result stops iteration and is returned to the caller.
 * Filters may replace the packet and frame header via m_p and
 * frame_header_p.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; busy marker
			 * above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5842 
/*
 * Pass an outbound packet (*m_p) through ifp's interface filter chain.
 * Returns 0 when the packet should continue toward the driver; a
 * filter's non-zero result stops iteration and is returned.  Filters
 * may replace the packet via m_p.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; busy marker
			 * above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5892 
/*
 * Deliver a chain of mbufs to a protocol attachment.  v1 protocols
 * receive one packet (plus its frame header) per callback; v2
 * protocols receive the entire chain in one call.  Packets are freed
 * here when the input callback returns an error other than
 * EJUSTRETURN.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char *  frame_header;
			mbuf_t  next_packet;

			/* detach this packet and its frame header */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5924 
5925 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5926 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5927     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5928 {
5929 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5930 
5931 	if (s->packets_in != 0) {
5932 		d->packets_in += s->packets_in;
5933 	}
5934 	if (s->bytes_in != 0) {
5935 		d->bytes_in += s->bytes_in;
5936 	}
5937 	if (s->errors_in != 0) {
5938 		d->errors_in += s->errors_in;
5939 	}
5940 
5941 	if (s->packets_out != 0) {
5942 		d->packets_out += s->packets_out;
5943 	}
5944 	if (s->bytes_out != 0) {
5945 		d->bytes_out += s->bytes_out;
5946 	}
5947 	if (s->errors_out != 0) {
5948 		d->errors_out += s->errors_out;
5949 	}
5950 
5951 	if (s->collisions != 0) {
5952 		d->collisions += s->collisions;
5953 	}
5954 	if (s->dropped != 0) {
5955 		d->dropped += s->dropped;
5956 	}
5957 
5958 	if (poll) {
5959 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5960 	}
5961 }
5962 
5963 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)5964 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
5965 {
5966 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
5967 
5968 	/*
5969 	 * Use of atomic operations is unavoidable here because
5970 	 * these stats may also be incremented elsewhere via KPIs.
5971 	 */
5972 	if (s->packets_in != 0) {
5973 		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
5974 		s->packets_in = 0;
5975 	}
5976 	if (s->bytes_in != 0) {
5977 		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
5978 		s->bytes_in = 0;
5979 	}
5980 	if (s->errors_in != 0) {
5981 		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
5982 		s->errors_in = 0;
5983 	}
5984 
5985 	if (s->packets_out != 0) {
5986 		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
5987 		s->packets_out = 0;
5988 	}
5989 	if (s->bytes_out != 0) {
5990 		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
5991 		s->bytes_out = 0;
5992 	}
5993 	if (s->errors_out != 0) {
5994 		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
5995 		s->errors_out = 0;
5996 	}
5997 
5998 	if (s->collisions != 0) {
5999 		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
6000 		s->collisions = 0;
6001 	}
6002 	if (s->dropped != 0) {
6003 		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
6004 		s->dropped = 0;
6005 	}
6006 
6007 	/*
6008 	 * No need for atomic operations as they are modified here
6009 	 * only from within the DLIL input thread context.
6010 	 */
6011 	if (ifp->if_poll_tstats.packets != 0) {
6012 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
6013 		ifp->if_poll_tstats.packets = 0;
6014 	}
6015 	if (ifp->if_poll_tstats.bytes != 0) {
6016 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
6017 		ifp->if_poll_tstats.bytes = 0;
6018 	}
6019 
6020 	return ifp->if_data_threshold != 0;
6021 }
6022 
6023 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6024 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6025 {
6026 	return dlil_input_packet_list_common(ifp, m, 0,
6027 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6028 }
6029 
6030 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6031 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6032     u_int32_t cnt, ifnet_model_t mode)
6033 {
6034 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6035 }
6036 
6037 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)6038 dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
6039     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
6040 {
6041 	int error = 0;
6042 	protocol_family_t protocol_family;
6043 	mbuf_t next_packet;
6044 	ifnet_t ifp = ifp_param;
6045 	char *frame_header = NULL;
6046 	struct if_proto *last_ifproto = NULL;
6047 	mbuf_t pkt_first = NULL;
6048 	mbuf_t *pkt_next = NULL;
6049 	u_int32_t poll_thresh = 0, poll_ival = 0;
6050 	int iorefcnt = 0;
6051 
6052 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6053 
6054 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
6055 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
6056 		poll_thresh = cnt;
6057 	}
6058 
6059 	while (m != NULL) {
6060 		struct if_proto *ifproto = NULL;
6061 		uint32_t pktf_mask;     /* pkt flags to preserve */
6062 
6063 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
6064 
6065 		if (ifp_param == NULL) {
6066 			ifp = m->m_pkthdr.rcvif;
6067 		}
6068 
6069 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
6070 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
6071 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
6072 			ifnet_poll(ifp);
6073 		}
6074 
6075 		/* Check if this mbuf looks valid */
6076 		MBUF_INPUT_CHECK(m, ifp);
6077 
6078 		next_packet = m->m_nextpkt;
6079 		m->m_nextpkt = NULL;
6080 		frame_header = m->m_pkthdr.pkt_hdr;
6081 		m->m_pkthdr.pkt_hdr = NULL;
6082 
6083 		/*
6084 		 * Get an IO reference count if the interface is not
6085 		 * loopback (lo0) and it is attached; lo0 never goes
6086 		 * away, so optimize for that.
6087 		 */
6088 		if (ifp != lo_ifp) {
6089 			/* iorefcnt is 0 if it hasn't been taken yet */
6090 			if (iorefcnt == 0) {
6091 				if (!ifnet_datamov_begin(ifp)) {
6092 					m_freem(m);
6093 					goto next;
6094 				}
6095 			}
6096 			iorefcnt = 1;
6097 			/*
6098 			 * Preserve the time stamp and skip pktap flags.
6099 			 */
6100 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
6101 		} else {
6102 			/*
6103 			 * If this arrived on lo0, preserve interface addr
6104 			 * info to allow for connectivity between loopback
6105 			 * and local interface addresses.
6106 			 */
6107 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
6108 		}
6109 		pktf_mask |= PKTF_WAKE_PKT;
6110 
6111 		/* make sure packet comes in clean */
6112 		m_classifier_init(m, pktf_mask);
6113 
6114 		ifp_inc_traffic_class_in(ifp, m);
6115 
6116 		/* find which protocol family this packet is for */
6117 		ifnet_lock_shared(ifp);
6118 		error = (*ifp->if_demux)(ifp, m, frame_header,
6119 		    &protocol_family);
6120 		ifnet_lock_done(ifp);
6121 		if (error != 0) {
6122 			if (error == EJUSTRETURN) {
6123 				goto next;
6124 			}
6125 			protocol_family = 0;
6126 		}
6127 
6128 #if (DEVELOPMENT || DEBUG)
6129 		/*
6130 		 * For testing we do not care about broadcast and multicast packets as
6131 		 * they are not as controllable as unicast traffic
6132 		 */
6133 		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
6134 			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
6135 			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
6136 				/*
6137 				 * This is a one-shot command
6138 				 */
6139 				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
6140 				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
6141 			}
6142 		}
6143 #endif /* (DEVELOPMENT || DEBUG) */
6144 		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
6145 			char buffer[64];
6146 			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
6147 
6148 			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
6149 			    ifp->if_xname, m_pktlen(m));
6150 			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
6151 				log_hexdump(buffer, buflen);
6152 			}
6153 		}
6154 
6155 		pktap_input(ifp, protocol_family, m, frame_header);
6156 
6157 		/* Drop v4 packets received on CLAT46 enabled cell interface */
6158 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6159 		    ifp->if_type == IFT_CELLULAR) {
6160 			m_freem(m);
6161 			ip6stat.ip6s_clat464_in_v4_drop++;
6162 			goto next;
6163 		}
6164 
6165 		/* Translate the packet if it is received on CLAT interface */
6166 		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
6167 		    && dlil_is_clat_needed(protocol_family, m)) {
6168 			char *data = NULL;
6169 			struct ether_header eh;
6170 			struct ether_header *ehp = NULL;
6171 
6172 			if (ifp->if_type == IFT_ETHER) {
6173 				ehp = (struct ether_header *)(void *)frame_header;
6174 				/* Skip RX Ethernet packets if they are not IPV6 */
6175 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
6176 					goto skip_clat;
6177 				}
6178 
6179 				/* Keep a copy of frame_header for Ethernet packets */
6180 				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
6181 			}
6182 			error = dlil_clat64(ifp, &protocol_family, &m);
6183 			data = (char *) mbuf_data(m);
6184 			if (error != 0) {
6185 				m_freem(m);
6186 				ip6stat.ip6s_clat464_in_drop++;
6187 				goto next;
6188 			}
6189 			/* Native v6 should be No-op */
6190 			if (protocol_family != PF_INET) {
6191 				goto skip_clat;
6192 			}
6193 
6194 			/* Do this only for translated v4 packets. */
6195 			switch (ifp->if_type) {
6196 			case IFT_CELLULAR:
6197 				frame_header = data;
6198 				break;
6199 			case IFT_ETHER:
6200 				/*
6201 				 * Drop if the mbuf doesn't have enough
6202 				 * space for Ethernet header
6203 				 */
6204 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
6205 					m_free(m);
6206 					ip6stat.ip6s_clat464_in_drop++;
6207 					goto next;
6208 				}
6209 				/*
6210 				 * Set the frame_header ETHER_HDR_LEN bytes
6211 				 * preceeding the data pointer. Change
6212 				 * the ether_type too.
6213 				 */
6214 				frame_header = data - ETHER_HDR_LEN;
6215 				eh.ether_type = htons(ETHERTYPE_IP);
6216 				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
6217 				break;
6218 			}
6219 		}
6220 skip_clat:
6221 		/*
6222 		 * Match the wake packet against the list of ports that has been
6223 		 * been queried by the driver before the device went to sleep
6224 		 */
6225 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
6226 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
6227 				if_ports_used_match_mbuf(ifp, protocol_family, m);
6228 			}
6229 		}
6230 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
6231 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
6232 			dlil_input_cksum_dbg(ifp, m, frame_header,
6233 			    protocol_family);
6234 		}
6235 		/*
6236 		 * For partial checksum offload, we expect the driver to
6237 		 * set the start offset indicating the start of the span
6238 		 * that is covered by the hardware-computed checksum;
6239 		 * adjust this start offset accordingly because the data
6240 		 * pointer has been advanced beyond the link-layer header.
6241 		 *
6242 		 * Virtual lan types (bridge, vlan, bond) can call
6243 		 * dlil_input_packet_list() with the same packet with the
6244 		 * checksum flags set. Set a flag indicating that the
6245 		 * adjustment has already been done.
6246 		 */
6247 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
6248 			/* adjustment has already been done */
6249 		} else if ((m->m_pkthdr.csum_flags &
6250 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6251 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6252 			int adj;
6253 			if (frame_header == NULL ||
6254 			    frame_header < (char *)mbuf_datastart(m) ||
6255 			    frame_header > (char *)m->m_data ||
6256 			    (adj = (int)(m->m_data - frame_header)) >
6257 			    m->m_pkthdr.csum_rx_start) {
6258 				m->m_pkthdr.csum_data = 0;
6259 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
6260 				hwcksum_in_invalidated++;
6261 			} else {
6262 				m->m_pkthdr.csum_rx_start -= adj;
6263 			}
6264 			/* make sure we don't adjust more than once */
6265 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
6266 		}
6267 		if (clat_debug) {
6268 			pktap_input(ifp, protocol_family, m, frame_header);
6269 		}
6270 
6271 		if (m->m_flags & (M_BCAST | M_MCAST)) {
6272 			atomic_add_64(&ifp->if_imcasts, 1);
6273 		}
6274 
6275 		/* run interface filters */
6276 		error = dlil_interface_filters_input(ifp, &m,
6277 		    &frame_header, protocol_family);
6278 		if (error != 0) {
6279 			if (error != EJUSTRETURN) {
6280 				m_freem(m);
6281 			}
6282 			goto next;
6283 		}
6284 		/*
6285 		 * A VLAN interface receives VLAN-tagged packets by attaching
6286 		 * its PF_VLAN protocol to a parent interface. When a VLAN
6287 		 * interface is a member of a bridge, the parent interface
6288 		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
6289 		 * M_PROMISC packet must be processed by the VLAN protocol
6290 		 * so that it can be sent up the stack via
6291 		 * dlil_input_packet_list(). That allows the bridge interface's
6292 		 * input filter, attached to the VLAN interface, to process
6293 		 * the packet.
6294 		 */
6295 		if (protocol_family != PF_VLAN &&
6296 		    (m->m_flags & M_PROMISC) != 0) {
6297 			m_freem(m);
6298 			goto next;
6299 		}
6300 
6301 		/* Lookup the protocol attachment to this interface */
6302 		if (protocol_family == 0) {
6303 			ifproto = NULL;
6304 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
6305 		    (last_ifproto->protocol_family == protocol_family)) {
6306 			VERIFY(ifproto == NULL);
6307 			ifproto = last_ifproto;
6308 			if_proto_ref(last_ifproto);
6309 		} else {
6310 			VERIFY(ifproto == NULL);
6311 			ifnet_lock_shared(ifp);
6312 			/* callee holds a proto refcnt upon success */
6313 			ifproto = find_attached_proto(ifp, protocol_family);
6314 			ifnet_lock_done(ifp);
6315 		}
6316 		if (ifproto == NULL) {
6317 			/* no protocol for this packet, discard */
6318 			m_freem(m);
6319 			goto next;
6320 		}
6321 		if (ifproto != last_ifproto) {
6322 			if (last_ifproto != NULL) {
6323 				/* pass up the list for the previous protocol */
6324 				dlil_ifproto_input(last_ifproto, pkt_first);
6325 				pkt_first = NULL;
6326 				if_proto_free(last_ifproto);
6327 			}
6328 			last_ifproto = ifproto;
6329 			if_proto_ref(ifproto);
6330 		}
6331 		/* extend the list */
6332 		m->m_pkthdr.pkt_hdr = frame_header;
6333 		if (pkt_first == NULL) {
6334 			pkt_first = m;
6335 		} else {
6336 			*pkt_next = m;
6337 		}
6338 		pkt_next = &m->m_nextpkt;
6339 
6340 next:
6341 		if (next_packet == NULL && last_ifproto != NULL) {
6342 			/* pass up the last list of packets */
6343 			dlil_ifproto_input(last_ifproto, pkt_first);
6344 			if_proto_free(last_ifproto);
6345 			last_ifproto = NULL;
6346 		}
6347 		if (ifproto != NULL) {
6348 			if_proto_free(ifproto);
6349 			ifproto = NULL;
6350 		}
6351 
6352 		m = next_packet;
6353 
6354 		/* update the driver's multicast filter, if needed */
6355 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6356 			ifp->if_updatemcasts = 0;
6357 		}
6358 		if (iorefcnt == 1) {
6359 			/* If the next mbuf is on a different interface, unlock data-mov */
6360 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
6361 				ifnet_datamov_end(ifp);
6362 				iorefcnt = 0;
6363 			}
6364 		}
6365 	}
6366 
6367 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6368 }
6369 
6370 errno_t
if_mcasts_update(struct ifnet * ifp)6371 if_mcasts_update(struct ifnet *ifp)
6372 {
6373 	errno_t err;
6374 
6375 	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6376 	if (err == EAFNOSUPPORT) {
6377 		err = 0;
6378 	}
6379 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6380 	    "(err=%d)\n", if_name(ifp),
6381 	    (err == 0 ? "successfully restored" : "failed to restore"),
6382 	    ifp->if_updatemcasts, err);
6383 
6384 	/* just return success */
6385 	return 0;
6386 }
6387 
/* If ifp is set, we will increment the generation for the interface */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	/*
	 * Bump the interface generation first so observers that poll the
	 * generation count re-read interface state after seeing the event.
	 */
	if (ifp) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	/* Let NECP clients re-evaluate policies against the new state. */
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6402 
6403 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6404 dlil_post_sifflags_msg(struct ifnet * ifp)
6405 {
6406 	struct kev_msg ev_msg;
6407 	struct net_event_data ev_data;
6408 
6409 	bzero(&ev_data, sizeof(ev_data));
6410 	bzero(&ev_msg, sizeof(ev_msg));
6411 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6412 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6413 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6414 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6415 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6416 	ev_data.if_family = ifp->if_family;
6417 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6418 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6419 	ev_msg.dv[0].data_ptr = &ev_data;
6420 	ev_msg.dv[1].data_length = 0;
6421 	dlil_post_complete_msg(ifp, &ev_msg);
6422 }
6423 
6424 #define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver a kernel event to an interface: first to its attached
 * interface filters, then to each attached protocol's event callback,
 * then to the interface's own if_event handler, and finally post the
 * event message (incrementing the interface generation when
 * update_generation is true).
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	/* small on-stack array avoids a heap allocation in the common case */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the callback; it may block */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many protocols for the stack array; go to heap */
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/*
		 * Take a refcnt on each proto so it stays valid after the
		 * ifnet lock is released below.
		 */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to every attached protocol, lock not held */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6524 
6525 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6526 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6527 {
6528 	struct kev_msg kev_msg;
6529 	int result = 0;
6530 
6531 	if (ifp == NULL || event == NULL) {
6532 		return EINVAL;
6533 	}
6534 
6535 	bzero(&kev_msg, sizeof(kev_msg));
6536 	kev_msg.vendor_code = event->vendor_code;
6537 	kev_msg.kev_class = event->kev_class;
6538 	kev_msg.kev_subclass = event->kev_subclass;
6539 	kev_msg.event_code = event->event_code;
6540 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6541 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6542 	kev_msg.dv[1].data_length = 0;
6543 
6544 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6545 
6546 	return result;
6547 }
6548 
6549 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6550 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6551 {
6552 	mbuf_t  n = m;
6553 	int chainlen = 0;
6554 
6555 	while (n != NULL) {
6556 		chainlen++;
6557 		n = n->m_next;
6558 	}
6559 	switch (chainlen) {
6560 	case 0:
6561 		break;
6562 	case 1:
6563 		atomic_add_64(&cls->cls_one, 1);
6564 		break;
6565 	case 2:
6566 		atomic_add_64(&cls->cls_two, 1);
6567 		break;
6568 	case 3:
6569 		atomic_add_64(&cls->cls_three, 1);
6570 		break;
6571 	case 4:
6572 		atomic_add_64(&cls->cls_four, 1);
6573 		break;
6574 	case 5:
6575 	default:
6576 		atomic_add_64(&cls->cls_five_or_more, 1);
6577 		break;
6578 	}
6579 }
6580 
#if CONFIG_DTRACE
/* Fire the DTrace ip:::send probe for an outbound IPv4/IPv6 packet. */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	switch (proto_family) {
	case PF_INET: {
		struct ip *ip4 = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip4, struct ifnet *, ifp,
		    struct ip *, ip4, struct ip6_hdr *, NULL);
		break;
	}
	case PF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
		break;
	}
	default:
		/* other families: no probe */
		break;
	}
}
#endif /* CONFIG_DTRACE */
6599 
6600 /*
6601  * dlil_output
6602  *
6603  * Caller should have a lock on the protocol domain if the protocol
6604  * doesn't support finer grained locking. In most cases, the lock
6605  * will be held from the socket layer and won't be released until
6606  * we return back to the socket layer.
6607  *
6608  * This does mean that we must take a protocol lock before we take
6609  * an interface lock if we're going to take both. This makes sense
6610  * because a protocol is likely to interact with an ifp while it
6611  * is under the protocol lock.
6612  *
6613  * An advisory code will be returned if adv is not null. This
6614  * can be used to provide feedback about interface queues to the
6615  * application.
6616  */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
	struct if_proto *proto = NULL;
	mbuf_t  m = NULL;
	mbuf_t  send_head = NULL;          /* chain batched for the driver */
	mbuf_t  *send_tail = &send_head;   /* tail pointer of that chain */
	int iorefcnt = 0;
	u_int32_t pre = 0, post = 0;       /* bytes framer prepended/appended */
	u_int32_t fpkts = 0, fbytes = 0;   /* forwarded packet/byte counters */
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;
	u_int32_t m_loop_set = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	if (raw == 0) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	if (packetlist == NULL) {
		goto cleanup;
	}

	/* detach the head packet from the caller's chain */
	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = (const struct sockaddr *)&dest6;

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retrieve the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed by packetlist is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (raw == 0) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (!raw) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (raw == 0 && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp.  This will allow
			 * us to determine that it is a legitimate packet
			 * for the system.  Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context.  If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		retval = dlil_interface_filters_output(ifp, &m, proto_family);
		if (retval != 0) {
			if (retval != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary.  If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Record timestamp; ifnet_enqueue() will use this info
		 * rather than redoing the work.  An optimization could
		 * involve doing this just once at the top, if there are
		 * no interface filters attached, but that's probably
		 * not a big deal.
		 */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		(void) mbuf_set_timestamp(m, now_nsec, TRUE);

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			/* batch onto send_head; handed to driver after loop */
			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* map queue state to flow advisory for caller */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		/* advance to the next packet in the caller's chain */
		m = packetlist;
		if (m != NULL) {
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			/* driver accepts chains: hand the whole batch off */
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			/* enqueue one at a time, then kick the start thread once */
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/* account forwarded traffic, then drop every reference we hold */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
7121 
7122 /*
7123  * This routine checks if the destination address is not a loopback, link-local,
7124  * multicast or broadcast address.
7125  */
7126 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7127 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7128 {
7129 	int ret = 0;
7130 	switch (proto_family) {
7131 	case PF_INET: {
7132 		struct ip *iph = mtod(m, struct ip *);
7133 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7134 			ret = 1;
7135 		}
7136 		break;
7137 	}
7138 	case PF_INET6: {
7139 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7140 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7141 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7142 			ret = 1;
7143 		}
7144 		break;
7145 	}
7146 	}
7147 
7148 	return ret;
7149 }
7150 /*
7151  * @brief This routine translates IPv4 packet to IPv6 packet,
7152  *     updates protocol checksum and also translates ICMP for code
7153  *     along with inner header translation.
7154  *
7155  * @param ifp Pointer to the interface
7156  * @param proto_family pointer to protocol family. It is updated if function
7157  *     performs the translation successfully.
7158  * @param m Pointer to the pointer pointing to the packet. Needed because this
7159  *     routine can end up changing the mbuf to a different one.
7160  *
7161  * @return 0 on success or else a negative value.
7162  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;
	struct in6_addr dst;
	int error = 0;
	uint16_t off = 0;
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 routines can rewrite it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Capture the IPv4 fields needed after the header is rewritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/* Hand the (possibly reallocated) mbuf back to the caller */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	/* Only flip the caller's proto family once translation succeeded */
	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7295 
/*
 * @brief This routine translates incoming IPv6 to IPv4 packet,
 *     updates protocol checksum and also translates ICMPv6 outer
 *     and inner headers
 *
 * @param ifp interface the packet arrived on; must have CLAT46 enabled
 * @param proto_family in/out; PF_INET6 on entry, set to PF_INET on
 *     successful translation
 * @param m in/out packet; the mbuf may be replaced as a side effect of
 *     pullup/translation, so callers must use the returned *m
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Keep the original addresses; the protocol translator needs them */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf for the nat464 translation routines */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrive the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* traffic class occupies bits 20-27 of the flow word */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly reallocated) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7437 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Argument of a deferred ioctl request queued by ifnet_ioctl_async() */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* target interface; an io ref is held for it */
	u_long ioctl_code;      /* ioctl to issue from the workqueue thread */
};

/* Workqueue entry embedding the ioctl argument; freed by the callback */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7450 
/*
 * Queue an ioctl to be issued against ifp from a network workqueue
 * thread.  Takes an io reference on the interface that is dropped by
 * ifnet_ioctl_event_callback(); silently logs and returns if the
 * interface is no longer attached.
 */
void
ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;

	/*
	 * Get an io ref count if the interface is attached.
	 * At this point it most likely is. We are taking a reference for
	 * deferred processing.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
		    "is not attached",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}

	/* Z_NOFAIL: allocation cannot fail, no need to check the result */
	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
}
7476 
/*
 * Workqueue handler for ifnet_ioctl_async(): issues the queued ioctl,
 * drops the io reference taken at enqueue time, and frees the entry.
 */
static void
ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
{
	/* recover the enclosing entry from the embedded wq element */
	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);

	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
	int ret = 0;

	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
	} else if (dlil_verbose) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
		    "for ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
	}
	/* release the io ref taken by ifnet_ioctl_async() */
	ifnet_decr_iorefcnt(ifp);
	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
	return;
}
7499 
/*
 * Dispatch an ioctl to, in order: the attached interface filters, the
 * protocol (if proto_fam is non-zero), and finally the interface's own
 * if_ioctl routine.  The first non-EOPNOTSUPP answer wins; EJUSTRETURN
 * from any layer stops the walk and is mapped to 0 for the caller.
 *
 * Returns EINVAL for bad arguments, EOPNOTSUPP if no layer handled the
 * ioctl, otherwise the handler's result.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock around the callout; monitor keeps the list stable */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7617 
7618 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7619 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7620 {
7621 	errno_t error = 0;
7622 
7623 
7624 	if (ifp->if_set_bpf_tap) {
7625 		/* Get an io reference on the interface if it is attached */
7626 		if (!ifnet_is_attached(ifp, 1)) {
7627 			return ENXIO;
7628 		}
7629 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7630 		ifnet_decr_iorefcnt(ifp);
7631 	}
7632 	return error;
7633 }
7634 
/*
 * Resolve a multicast protocol address into a link-layer address.
 * The attached protocol is asked first; the interface's if_check_multi
 * hook then verifies either the resolved link address (on success) or
 * the original protocol address (if the protocol had no resolver).
 *
 * Returns 0 on success, EOPNOTSUPP if neither layer could handle it,
 * or the error from the resolver / verifier.
 */
errno_t
dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
    struct sockaddr *ll_addr, size_t ll_len)
{
	errno_t result = EOPNOTSUPP;
	struct if_proto *proto;
	const struct sockaddr *verify;
	proto_media_resolve_multi resolvep;

	/* take an io ref; bail if the interface is detaching */
	if (!ifnet_is_attached(ifp, 1)) {
		return result;
	}

	bzero(ll_addr, ll_len);

	/* Call the protocol first; callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, proto_addr->sa_family);
	ifnet_lock_done(ifp);
	if (proto != NULL) {
		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
		if (resolvep != NULL) {
			result = resolvep(ifp, proto_addr,
			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
		}
		if_proto_free(proto);
	}

	/* Let the interface verify the multicast address */
	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
		if (result == 0) {
			verify = ll_addr;
		} else {
			verify = proto_addr;
		}
		result = ifp->if_check_multi(ifp, verify);
	}

	ifnet_decr_iorefcnt(ifp);
	return result;
}
7677 
/*
 * Send an ARP packet out a single interface by delegating to the
 * attached protocol's send_arp callback.  Also bumps the tx ARP
 * statistics for requests/replies.  Returns ENOTSUP if the interface
 * has IFF_NOARP set or no capable protocol is attached.
 */
__private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
	struct if_proto *proto;
	errno_t result = 0;

	if ((ifp->if_flags & IFF_NOARP) != 0) {
		result = ENOTSUP;
		goto done;
	}

	/* callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, target_proto->sa_family);
	ifnet_lock_done(ifp);
	if (proto == NULL) {
		result = ENOTSUP;
	} else {
		proto_media_send_arp    arpp;
		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
		if (arpp == NULL) {
			result = ENOTSUP;
		} else {
			switch (arpop) {
			case ARPOP_REQUEST:
				arpstat.txrequests++;
				/* unicast (directed) request when target hw is known */
				if (target_hw != NULL) {
					arpstat.txurequests++;
				}
				break;
			case ARPOP_REPLY:
				arpstat.txreplies++;
				break;
			}
			result = arpp(ifp, arpop, sender_hw, sender_proto,
			    target_hw, target_proto);
		}
		if_proto_free(proto);
	}
done:
	return result;
}
7723 
/*
 * Opaque cookie type for per-thread network marks.  A cookie is a
 * pointer offset from the address of net_thread_marks_base; the offset
 * encodes the mark bits a matching pop must undo (0 == nothing).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* "no-op" cookie: popping it restores nothing */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7729 
/*
 * Set the requested mark bits on the current uthread and return a
 * cookie encoding which bits were newly set, so that a later
 * net_thread_marks_pop() clears only those.  The cookie is the address
 * of net_thread_marks_base offset by the bit mask.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only bits not already set need to be undone later */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	return (net_thread_marks_t)&base[pop];
}
7747 
/*
 * Inverse of net_thread_marks_push(): temporarily CLEAR the requested
 * mark bits on the current uthread.  The returned cookie encodes which
 * bits were actually cleared so net_thread_unmarks_pop() can restore
 * exactly those.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only bits currently set need to be restored later */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	return (net_thread_marks_t)&base[unpop];
}
7765 
/*
 * Undo a net_thread_marks_push(): decode the bit mask from the cookie's
 * offset from the static base and clear exactly those bits, verifying
 * they are still set.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* offset must fit a 32-bit mask and match currently-set bits */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7781 
/*
 * Undo a net_thread_unmarks_push(): decode the bit mask from the
 * cookie's offset and set exactly those bits back, verifying they are
 * currently clear.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* offset must fit a 32-bit mask and those bits must be clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7797 
7798 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7799 net_thread_is_marked(u_int32_t check)
7800 {
7801 	if (check != 0) {
7802 		struct uthread *uth = current_uthread();
7803 		return uth->uu_network_marks & check;
7804 	} else {
7805 		return 0;
7806 	}
7807 }
7808 
7809 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7810 net_thread_is_unmarked(u_int32_t check)
7811 {
7812 	if (check != 0) {
7813 		struct uthread *uth = current_uthread();
7814 		return ~uth->uu_network_marks & check;
7815 	} else {
7816 		return 0;
7817 	}
7818 }
7819 
/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address
 * as both sender and target.  Returns non-zero when the two addresses
 * match, 0 otherwise or when either argument is NULL.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	if (sender_sin == NULL || target_sin == NULL) {
		return 0;
	}

	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
7830 
/*
 * Send an ARP packet.  Normally delegates to dlil_send_arp_internal()
 * on the given interface; the exception is an ARP request for an
 * IPv4 link-local target (when ipv4_ll_arp_aware is enabled), which is
 * fanned out to every IFEF_ARPLL interface that has an IPv4 address --
 * unless the request is an announcement, which must stay on its own
 * interface.  RTF_ROUTER targets are flagged via SIN_ROUTER so the
 * protocol callback can see the target is a (default) router.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the hw address alive across the send */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* report success if any interface accepted it */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7945 
7946 /*
7947  * Caller must hold ifnet head lock.
7948  */
7949 static int
ifnet_lookup(struct ifnet * ifp)7950 ifnet_lookup(struct ifnet *ifp)
7951 {
7952 	struct ifnet *_ifp;
7953 
7954 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7955 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7956 		if (_ifp == ifp) {
7957 			break;
7958 		}
7959 	}
7960 	return _ifp != NULL;
7961 }
7962 
/*
 * Caller has to pass a non-zero refio argument to get a
 * IO reference count. This will prevent ifnet_detach from
 * being called when there are outstanding io reference counts.
 *
 * Returns non-zero iff the interface is fully attached; when refio > 0
 * and it is attached, if_refio is bumped atomically under if_ref_lock.
 */
int
ifnet_is_attached(struct ifnet *ifp, int refio)
{
	int ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED(ifp))) {
		if (refio > 0) {
			ifp->if_refio++;
		}
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
7983 
/*
 * Account a newly created interface worker thread; paired with
 * ifnet_decr_pending_thread_count() when the thread is up.
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7991 
/*
 * Drop the pending-thread count taken by
 * ifnet_incr_pending_thread_count(); wakes any waiter sleeping on the
 * counter once it reaches zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8003 
/*
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already.
 * Most callers would call ifnet_is_attached() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(IF_FULLY_ATTACHED(ifp));
	/* a ref must already exist, per the contract above */
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8018 
/*
 * Drop one io reference with if_ref_lock already held; wakes a
 * detaching thread sleeping on if_refio when the count hits zero.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov refs are a subset of io refs; they cannot outlive them */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8039 
/* Drop one io reference, taking and releasing if_ref_lock. */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8047 
/*
 * Enter the data path on ifp: takes both an io ref and a datamov ref
 * if the interface is fully attached and ready (not suspended).
 * Returns FALSE (with no refs taken) otherwise; on success the caller
 * must balance with ifnet_datamov_end().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8062 
/*
 * Leave the data path: drops the datamov ref and the io ref taken by
 * ifnet_datamov_begin(), waking any drainer blocked in
 * ifnet_datamov_drain() once no data-moving thread remains.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8080 
/*
 * Suspend data movement with if_ref_lock held: takes an io ref and, on
 * the first suspension, clears IFRF_READY so new
 * ifnet_datamov_begin() calls fail.  Balanced by ifnet_datamov_resume().
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8091 
/* Suspend data movement, taking if_ref_lock around the locked helper. */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8100 
/*
 * Suspend data movement only if nobody has suspended it yet.
 * Returns TRUE if this call performed the suspension (caller must later
 * resume), FALSE if the interface was already suspended.
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
8114 
/*
 * Wait until all threads currently moving data on ifp have left the
 * data path, then flush the interface send queues.  Data movement must
 * already be suspended (if_suspend > 0) before calling.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until the last ifnet_datamov_end() wakes us */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8142 
/* Convenience: suspend data movement and then drain the data path. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8149 
/*
 * Undo one suspension: on the last resume, restore IFRF_READY so the
 * data path may be entered again, then drop the io ref taken by the
 * matching suspend.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8163 
/*
 * Record a refhold/refrele event for a DLIF_DEBUG interface in the
 * per-direction circular trace buffer (IF_REF_TRACE_HIST_SIZE entries).
 * Panics if the dlil_ifnet was not allocated with a debug structure.
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	/* pick the hold or release counter/buffer pair */
	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* atomically claim the next slot; buffer wraps around */
	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
8188 
/*
 * Take a reference on the dlil_ifnet backing ifp; records the event via
 * dl_if_trace when debug tracing is enabled.  Panics on refcount
 * wraparound.  Returns EINVAL if ifp is NULL.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8211 
/*
 * Release a reference taken by dlil_if_ref().  When the last reference
 * is being dropped on an embryonic (never fully attached) interface,
 * the underlying storage is released via _dlil_if_release() after the
 * lock is dropped.  Panics on underflow; returns EINVAL if ifp is NULL.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* last ref: embryonic interfaces are torn down here */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8246 
/*
 * Attach an if_proto to its interface: lets the family module refine
 * the demux descriptors, inserts the protocol at the tail of its hash
 * chain, and posts a KEV_DL_PROTO_ATTACHED event.  On success the
 * protocol holds an attach reference.  Returns EINVAL for disallowed
 * vmnet attachments, ENXIO if the interface is detached, EEXIST if the
 * family is already attached, or the family module's error.
 *
 * @param proto_count if non-NULL, set to the number of protocols
 *     attached after this one (as reported in the kernel event).
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* hold an io ref across the attach */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8326 
8327 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8328 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8329 {
8330 	/*
8331 	 * A protocol has been attached, mark the interface up.
8332 	 * This used to be done by configd.KernelEventMonitor, but that
8333 	 * is inherently prone to races (rdar://problem/30810208).
8334 	 */
8335 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8336 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8337 	dlil_post_sifflags_msg(ifp);
8338 #if SKYWALK
8339 	switch (protocol) {
8340 	case AF_INET:
8341 	case AF_INET6:
8342 		/* don't attach the flowswitch unless attaching IP */
8343 		dlil_attach_flowswitch_nexus(ifp);
8344 		break;
8345 	default:
8346 		break;
8347 	}
8348 #endif /* SKYWALK */
8349 }
8350 
8351 errno_t
ifnet_attach_protocol(ifnet_t ifp,protocol_family_t protocol,const struct ifnet_attach_proto_param * proto_details)8352 ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
8353     const struct ifnet_attach_proto_param *proto_details)
8354 {
8355 	int retval = 0;
8356 	struct if_proto  *ifproto = NULL;
8357 	uint32_t proto_count = 0;
8358 
8359 	ifnet_head_lock_shared();
8360 	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8361 		retval = EINVAL;
8362 		goto end;
8363 	}
8364 	/* Check that the interface is in the global list */
8365 	if (!ifnet_lookup(ifp)) {
8366 		retval = ENXIO;
8367 		goto end;
8368 	}
8369 
8370 	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8371 
8372 	/* refcnt held above during lookup */
8373 	ifproto->ifp = ifp;
8374 	ifproto->protocol_family = protocol;
8375 	ifproto->proto_kpi = kProtoKPI_v1;
8376 	ifproto->kpi.v1.input = proto_details->input;
8377 	ifproto->kpi.v1.pre_output = proto_details->pre_output;
8378 	ifproto->kpi.v1.event = proto_details->event;
8379 	ifproto->kpi.v1.ioctl = proto_details->ioctl;
8380 	ifproto->kpi.v1.detached = proto_details->detached;
8381 	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
8382 	ifproto->kpi.v1.send_arp = proto_details->send_arp;
8383 
8384 	retval = dlil_attach_protocol(ifproto,
8385 	    proto_details->demux_list, proto_details->demux_count,
8386 	    &proto_count);
8387 
8388 end:
8389 	if (retval == EEXIST) {
8390 		/* already attached */
8391 		if (dlil_verbose) {
8392 			DLIL_PRINTF("%s: protocol %d already attached\n",
8393 			    ifp != NULL ? if_name(ifp) : "N/A",
8394 			    protocol);
8395 		}
8396 	} else if (retval != 0) {
8397 		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
8398 		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8399 	} else if (dlil_verbose) {
8400 		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
8401 		    ifp != NULL ? if_name(ifp) : "N/A",
8402 		    protocol, proto_count);
8403 	}
8404 	ifnet_head_done();
8405 	if (retval == 0) {
8406 		dlil_handle_proto_attach(ifp, protocol);
8407 	} else if (ifproto != NULL) {
8408 		zfree(dlif_proto_zone, ifproto);
8409 	}
8410 	return retval;
8411 }
8412 
8413 errno_t
ifnet_attach_protocol_v2(ifnet_t ifp,protocol_family_t protocol,const struct ifnet_attach_proto_param_v2 * proto_details)8414 ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
8415     const struct ifnet_attach_proto_param_v2 *proto_details)
8416 {
8417 	int retval = 0;
8418 	struct if_proto  *ifproto = NULL;
8419 	uint32_t proto_count = 0;
8420 
8421 	ifnet_head_lock_shared();
8422 	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8423 		retval = EINVAL;
8424 		goto end;
8425 	}
8426 	/* Check that the interface is in the global list */
8427 	if (!ifnet_lookup(ifp)) {
8428 		retval = ENXIO;
8429 		goto end;
8430 	}
8431 
8432 	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8433 
8434 	/* refcnt held above during lookup */
8435 	ifproto->ifp = ifp;
8436 	ifproto->protocol_family = protocol;
8437 	ifproto->proto_kpi = kProtoKPI_v2;
8438 	ifproto->kpi.v2.input = proto_details->input;
8439 	ifproto->kpi.v2.pre_output = proto_details->pre_output;
8440 	ifproto->kpi.v2.event = proto_details->event;
8441 	ifproto->kpi.v2.ioctl = proto_details->ioctl;
8442 	ifproto->kpi.v2.detached = proto_details->detached;
8443 	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
8444 	ifproto->kpi.v2.send_arp = proto_details->send_arp;
8445 
8446 	retval = dlil_attach_protocol(ifproto,
8447 	    proto_details->demux_list, proto_details->demux_count,
8448 	    &proto_count);
8449 
8450 end:
8451 	if (retval == EEXIST) {
8452 		/* already attached */
8453 		if (dlil_verbose) {
8454 			DLIL_PRINTF("%s: protocol %d already attached\n",
8455 			    ifp != NULL ? if_name(ifp) : "N/A",
8456 			    protocol);
8457 		}
8458 	} else if (retval != 0) {
8459 		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
8460 		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8461 	} else if (dlil_verbose) {
8462 		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
8463 		    ifp != NULL ? if_name(ifp) : "N/A",
8464 		    protocol, proto_count);
8465 	}
8466 	ifnet_head_done();
8467 	if (retval == 0) {
8468 		dlil_handle_proto_attach(ifp, protocol);
8469 	} else if (ifproto != NULL) {
8470 		zfree(dlif_proto_zone, ifproto);
8471 	}
8472 	return retval;
8473 }
8474 
/*
 * Detach a protocol family from an interface.
 *
 * Removes the if_proto from the per-family hash chain and repoints its
 * KPI callbacks at the inert ifproto_media_* placeholders, so callers
 * still holding a protocol reference get ENXIO (or a no-op) instead of
 * calling into the detached protocol module.  Two protocol references
 * are dropped here: the one held since attach and the one taken by the
 * lookup; the remaining detach steps run when the last reference goes
 * away.
 *
 * Returns EINVAL on bad arguments, ENXIO if the family is not attached.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Replace the KPI callbacks with harmless media placeholders;
	 * lingering users of this if_proto hit those stubs rather than
	 * the protocol module being detached.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8540 
8541 
8542 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8543 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8544     struct mbuf *packet, char *header)
8545 {
8546 #pragma unused(ifp, protocol, packet, header)
8547 	return ENXIO;
8548 }
8549 
8550 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8551 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8552     struct mbuf *packet)
8553 {
8554 #pragma unused(ifp, protocol, packet)
8555 	return ENXIO;
8556 }
8557 
8558 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8559 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8560     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8561     char *link_layer_dest)
8562 {
8563 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8564 	return ENXIO;
8565 }
8566 
8567 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8568 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8569     const struct kev_msg *event)
8570 {
8571 #pragma unused(ifp, protocol, event)
8572 }
8573 
8574 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8575 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8576     unsigned long command, void *argument)
8577 {
8578 #pragma unused(ifp, protocol, command, argument)
8579 	return ENXIO;
8580 }
8581 
8582 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8583 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8584     struct sockaddr_dl *out_ll, size_t ll_len)
8585 {
8586 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8587 	return ENXIO;
8588 }
8589 
8590 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8591 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8592     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8593     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8594 {
8595 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8596 	return ENXIO;
8597 }
8598 
8599 extern int if_next_index(void);
8600 extern int tcp_ecn_outbound;
8601 
8602 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8603 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8604 {
8605 	uint32_t sflags = 0;
8606 	int err;
8607 
8608 	if (if_flowadv) {
8609 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8610 	}
8611 
8612 	if (if_delaybased_queue) {
8613 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8614 	}
8615 
8616 	if (ifp->if_output_sched_model ==
8617 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8618 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8619 	}
8620 	/* Inherit drop limit from the default queue */
8621 	if (ifp->if_snd != ifcq) {
8622 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8623 	}
8624 	/* Initialize transmit queue(s) */
8625 	err = ifclassq_setup(ifcq, ifp, sflags);
8626 	if (err != 0) {
8627 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8628 		    "err=%d", __func__, ifp, err);
8629 		/* NOTREACHED */
8630 	}
8631 }
8632 
/*
 * Attach an ifnet to the system.
 *
 * Inserts the interface into the global list (ifnet_head) and
 * ifindex2ifnet[], sets up its link-layer address, transmit queue(s)
 * and DLIL input/starter/poller threads, then clears the embryonic
 * state by marking the ifnet IFRF_ATTACHED|IFRF_READY and posts
 * KEV_DL_IF_ATTACHED.
 *
 * ll_addr, if non-NULL, supplies the initial link-layer address; its
 * length must match if_addrlen when the latter is already non-zero.
 *
 * Returns 0 on success; EINVAL (NULL ifp or address-length mismatch),
 * EEXIST (already attached), ENODEV (no family module), or ENOBUFS
 * (no ifindex or lladdr storage) on failure.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* the supplied link-layer address length must agree with if_addrlen */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = zalloc_flags(dlif_phash_zone,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* a recycled (DLIF_REUSE) ifnet keeps its multicast list */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		if (idx == -1) {
			/* no interface index available */
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the interface in the global list and index table */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* no dedicated input thread for this interface */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* give the starter thread a slight priority boost */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* give the poller thread a slight priority boost */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* count suspended link-layer memberships kept across a reuse */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9120 
9121 /*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself.  Although the link
9124  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9125  * its location in memory must never change as it may still be referred
9126  * to by some parts of the system afterwards (unfortunate implementation
9127  * artifacts inherited from BSD.)
9128  *
9129  * Caller must hold ifnet lock as writer.
9130  */
/*
 * Allocate (or reuse) the sockaddr_dl storage backing the interface's
 * permanent link-level ifaddr, fill in the address and netmask from the
 * interface name and ll_addr, and install it as ifp->if_lladdr.
 * Returns the ifaddr with an additional reference held on behalf of
 * the ifnet itself; the previous if_lladdr reference, if any, is dropped.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/* sdl_data carries the interface name followed by the link address */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	/* round total size up to a 32-bit boundary */
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: never freed (see header note) */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	/* netmask: all-ones bytes covering the embedded interface name */
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference on the link address we replaced, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9239 
/*
 * Ask the INET and INET6 layers to drop all of their addresses on
 * this interface; used during interface detach.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9248 
/*
 * First stage of interface detach: mark the interface down, flag it
 * IFRF_DETACHING, unlink it from ifnet_head and ifindex2ifnet[] so it
 * is no longer visible to lookups, reset assorted per-interface state,
 * and enqueue it for the detacher thread, which performs the final
 * teardown in ifnet_detach_final().
 *
 * Returns EINVAL if ifp is NULL or not attached, ENXIO if a detach is
 * already in progress, 0 otherwise.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Invalidate the ND6 CGA state for this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9441 
/*
 * Append ifp to the list of interfaces awaiting final detach and wake
 * the detacher thread.  Caller must hold the dlil_if lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0); /* counter must not wrap to zero */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	wakeup((caddr_t)&ifnet_delayed_run);
}
9452 
/*
 * Remove and return the next interface awaiting final detach, or NULL
 * if the list is empty.  Caller must hold the dlil_if lock.
 */
static struct ifnet *
ifnet_detaching_dequeue(void)
{
	struct ifnet *ifp;

	dlil_if_lock_assert();

	ifp = TAILQ_FIRST(&ifnet_detaching_head);
	/* count and list emptiness must agree */
	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
	if (ifp != NULL) {
		VERIFY(ifnet_detaching_cnt != 0);
		--ifnet_detaching_cnt;
		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
		ifp->if_detaching_link.tqe_next = NULL;
		ifp->if_detaching_link.tqe_prev = NULL;
	}
	return ifp;
}
9471 
/*
 * Continuation for the detacher thread: drain the detaching list,
 * running ifnet_detach_final() on each interface (with the dlil_if
 * lock dropped across the call), then block on ifnet_delayed_run with
 * this same function as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock: final detach may block */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* wait for the next enqueue; resume in this function */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9514 
/*
 * Entry point for the detacher thread: arm the wait channel, signal
 * once to leave the embryonic state, then block with
 * ifnet_detacher_thread_cont as the continuation.  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9531 
/*
 * Final stage of interface detach, run from the detacher thread:
 * waits for outstanding I/O references to drain, tears down the send
 * queue, filters, and protocols, removes the permanent link address,
 * terminates the starter/poller/input threads, points the driver
 * entry points at local stubs (the driver may unload), and finally
 * clears IFRF_DETACHING and releases the attach reference.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach.  This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/* Detach interface filters */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	/* take a snapshot of the filter list, then empty it */
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		/* drop the lock: detaching a filter may block */
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/* Unplumb all protocols */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			/* drop the ifnet lock: unplumb may block */
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	zfree(dlif_phash_zone, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	IFA_REMREF(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time.  Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;    /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop the ifnet lock while we sleep */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous count needs to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_flags &= ~IFF_PROMISC;

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* invoke the driver's detach callback, if any */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9912 
9913 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9914 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9915 {
9916 #pragma unused(ifp)
9917 	m_freem_list(m);
9918 	return 0;
9919 }
9920 
/*
 * Start stub installed on detached interfaces: just purge whatever is
 * queued on the interface.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9926 
9927 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9928 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9929     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9930     boolean_t poll, struct thread *tp)
9931 {
9932 #pragma unused(ifp, m_tail, s, poll, tp)
9933 	m_freem_list(m_head);
9934 	return ENXIO;
9935 }
9936 
9937 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9938 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9939     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9940 {
9941 #pragma unused(ifp, flags, max_cnt)
9942 	if (m_head != NULL) {
9943 		*m_head = NULL;
9944 	}
9945 	if (m_tail != NULL) {
9946 		*m_tail = NULL;
9947 	}
9948 	if (cnt != NULL) {
9949 		*cnt = 0;
9950 	}
9951 	if (len != NULL) {
9952 		*len = 0;
9953 	}
9954 }
9955 
/*
 * Control stub installed on detached interfaces (both if_output_ctl
 * and if_input_ctl); no control operations are supported.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9962 
/*
 * Demux stub installed on detached interfaces: free the packet and
 * return EJUSTRETURN so the caller performs no further processing.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9970 
/*
 * add_proto stub installed on detached interfaces: protocols can no
 * longer be attached, so always fail with EINVAL.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9978 
/*
 * del_proto stub installed on detached interfaces: nothing to remove,
 * so always fail with EINVAL.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9985 
/*
 * check_multi stub installed on detached interfaces: multicast
 * membership checks are not supported.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
9992 
/*
 * Legacy framer stub installed on detached interfaces; forwards to
 * ifp_if_framer_extended.  On non-macOS targets the legacy signature
 * carries pre/post length pointers; on macOS NULLs are passed instead.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10011 
10012 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10013 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10014     const struct sockaddr *sa, const char *ll, const char *t,
10015     u_int32_t *pre, u_int32_t *post)
10016 {
10017 #pragma unused(ifp, sa, ll, t)
10018 	m_freem(*m);
10019 	*m = NULL;
10020 
10021 	if (pre != NULL) {
10022 		*pre = 0;
10023 	}
10024 	if (post != NULL) {
10025 		*post = 0;
10026 	}
10027 
10028 	return EJUSTRETURN;
10029 }
10030 
/*
 * ioctl stub installed on detached interfaces: no ioctls supported.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10037 
/*
 * BPF tap stub installed on detached interfaces: accept and ignore
 * the request.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10045 
/*
 * No-op free callback installed on detached interfaces in place of the
 * driver's original if_free (the driver may have unloaded).
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10051 
/*
 * No-op event callback installed on detached interfaces; kernel
 * events delivered after detach are silently discarded.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10057 
10058 int
dlil_if_acquire(u_int32_t family,const void * uniqueid,size_t uniqueid_len,const char * ifxname,struct ifnet ** ifp)10059 dlil_if_acquire(u_int32_t family, const void *uniqueid,
10060     size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
10061 {
10062 	struct ifnet *ifp1 = NULL;
10063 	struct dlil_ifnet *dlifp1 = NULL;
10064 	struct dlil_ifnet *dlifp1_saved = NULL;
10065 	void *buf, *base, **pbuf;
10066 	int ret = 0;
10067 
10068 	VERIFY(*ifp == NULL);
10069 	dlil_if_lock();
10070 	/*
10071 	 * We absolutely can't have an interface with the same name
10072 	 * in in-use state.
10073 	 * To make sure of that list has to be traversed completely
10074 	 */
10075 	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
10076 		ifp1 = (struct ifnet *)dlifp1;
10077 
10078 		if (ifp1->if_family != family) {
10079 			continue;
10080 		}
10081 
10082 		/*
10083 		 * If interface is in use, return EBUSY if either unique id
10084 		 * or interface extended names are the same
10085 		 */
10086 		lck_mtx_lock(&dlifp1->dl_if_lock);
10087 		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
10088 		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
10089 			lck_mtx_unlock(&dlifp1->dl_if_lock);
10090 			ret = EBUSY;
10091 			goto end;
10092 		}
10093 
10094 		if (uniqueid_len != 0 &&
10095 		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
10096 		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
10097 			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
10098 				lck_mtx_unlock(&dlifp1->dl_if_lock);
10099 				ret = EBUSY;
10100 				goto end;
10101 			}
10102 			if (dlifp1_saved == NULL) {
10103 				/* cache the first match */
10104 				dlifp1_saved = dlifp1;
10105 			}
10106 			/*
10107 			 * Do not break or jump to end as we have to traverse
10108 			 * the whole list to ensure there are no name collisions
10109 			 */
10110 		}
10111 		lck_mtx_unlock(&dlifp1->dl_if_lock);
10112 	}
10113 
10114 	/* If there's an interface that can be recycled, use that */
10115 	if (dlifp1_saved != NULL) {
10116 		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
10117 		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
10118 			/* some other thread got in ahead of us */
10119 			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
10120 			ret = EBUSY;
10121 			goto end;
10122 		}
10123 		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
10124 		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
10125 		*ifp = (struct ifnet *)dlifp1_saved;
10126 		dlil_if_ref(*ifp);
10127 		goto end;
10128 	}
10129 
10130 	/* no interface found, allocate a new one */
10131 	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10132 
10133 	/* Get the 64-bit aligned base address for this object */
10134 	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
10135 	    sizeof(u_int64_t));
10136 	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));
10137 
10138 	/*
10139 	 * Wind back a pointer size from the aligned base and
10140 	 * save the original address so we can free it later.
10141 	 */
10142 	pbuf = (void **)((intptr_t)base - sizeof(void *));
10143 	*pbuf = buf;
10144 	dlifp1 = base;
10145 
10146 	if (uniqueid_len) {
10147 		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
10148 		    Z_WAITOK);
10149 		if (dlifp1->dl_if_uniqueid == NULL) {
10150 			zfree(dlif_zone, buf);
10151 			ret = ENOMEM;
10152 			goto end;
10153 		}
10154 		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
10155 		dlifp1->dl_if_uniqueid_len = uniqueid_len;
10156 	}
10157 
10158 	ifp1 = (struct ifnet *)dlifp1;
10159 	dlifp1->dl_if_flags = DLIF_INUSE;
10160 	if (ifnet_debug) {
10161 		dlifp1->dl_if_flags |= DLIF_DEBUG;
10162 		dlifp1->dl_if_trace = dlil_if_trace;
10163 	}
10164 	ifp1->if_name = dlifp1->dl_if_namestorage;
10165 	ifp1->if_xname = dlifp1->dl_if_xnamestorage;
10166 
10167 	/* initialize interface description */
10168 	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
10169 	ifp1->if_desc.ifd_len = 0;
10170 	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
10171 
10172 #if SKYWALK
10173 	SLIST_INIT(&ifp1->if_netns_tokens);
10174 #endif /* SKYWALK */
10175 
10176 	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
10177 		DLIL_PRINTF("%s: failed to allocate if local stats, "
10178 		    "error: %d\n", __func__, ret);
10179 		/* This probably shouldn't be fatal */
10180 		ret = 0;
10181 	}
10182 
10183 	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
10184 	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
10185 	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
10186 	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
10187 	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
10188 	    &ifnet_lock_attr);
10189 	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
10190 #if INET
10191 	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
10192 	    &ifnet_lock_attr);
10193 	ifp1->if_inetdata = NULL;
10194 #endif
10195 	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
10196 	ifp1->if_inet6_ioctl_busy = FALSE;
10197 	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
10198 	    &ifnet_lock_attr);
10199 	ifp1->if_inet6data = NULL;
10200 	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
10201 	    &ifnet_lock_attr);
10202 	ifp1->if_link_status = NULL;
10203 
10204 	/* for send data paths */
10205 	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
10206 	    &ifnet_lock_attr);
10207 	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
10208 	    &ifnet_lock_attr);
10209 
10210 	/* for receive data paths */
10211 	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
10212 	    &ifnet_lock_attr);
10213 
10214 	/* thread call allocation is done with sleeping zalloc */
10215 	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
10216 	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
10217 	if (ifp1->if_dt_tcall == NULL) {
10218 		panic_plain("%s: couldn't create if_dt_tcall", __func__);
10219 		/* NOTREACHED */
10220 	}
10221 
10222 	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
10223 
10224 	*ifp = ifp1;
10225 	dlil_if_ref(*ifp);
10226 
10227 end:
10228 	dlil_if_unlock();
10229 
10230 	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
10231 	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));
10232 
10233 	return ret;
10234 }
10235 
/*
 * Common interface-release path: drop the net API allocation counters,
 * free any out-of-line broadcast address, reset the interface name and
 * extended name back to the embedded dlil_ifnet storage, and optionally
 * clear DLIF_INUSE so the dlil_ifnet can be recycled by dlil_if_acquire().
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	/* The ifnet is embedded at the start of struct dlil_ifnet */
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	/* Balance the counters bumped when the interface was allocated */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* A broadcast address larger than the inline buffer was heap-allocated */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	/* Point if_name back at the storage embedded in the dlil_ifnet */
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10266 
/*
 * Public release entry point: performs the common teardown but leaves
 * DLIF_INUSE set; the in-use flag is presumably cleared on a separate
 * path that passes clear_in_use = true (not visible in this chunk).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10272 
/* Acquire the global lock protecting dlil_ifnet_head */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10278 
/* Release the global lock protecting dlil_ifnet_head */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10284 
/* Assert that the current thread owns the dlil interface-list lock */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10290 
/*
 * Detach the PF_INET and PF_INET6 protocol attachments from 'ifp';
 * any other attached protocols are expected to detach themselves.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10306 
/*
 * Copy the interface's cached IPv4 forwarding route into 'dst'
 * (route_copyout semantics; presumably transfers a route reference to
 * the caller's copy — confirm against route.c).
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	/* Grab as spin lock, then convert to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10317 
/*
 * Store 'src' back into the interface's cached IPv4 forwarding route.
 * If forwarding-route caching is disabled (if_fwd_cacheok == 0) the
 * route reference in 'src' is released instead of being cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	/* Grab as spin lock, then convert to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10331 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * forwarding route into 'dst'.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	/* Grab as spin lock, then convert to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10343 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): cache 'src' in the
 * interface, or release it when forwarding-route caching is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	/* Grab as spin lock, then convert to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10358 
/*
 * Look up a route to 'src_ip' scoped to 'ifp', using the interface's
 * cached forwarding route when it is still usable and matches the
 * destination; otherwise perform a fresh scoped lookup and refresh the
 * cache.  Returns a referenced rtentry owned by the caller, or NULL
 * when the lookup fails.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	/* Cache miss: route unusable or cached destination differs */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (Re)initialize the sockaddr if it wasn't already AF_INET */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10393 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up a route to
 * 'src_ip6' scoped to 'ifp', preferring the cached forwarding route.
 * Returns a referenced rtentry owned by the caller, or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	/* Cache miss: route unusable or cached destination differs */
	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		/* (Re)initialize the sockaddr if it wasn't already AF_INET6 */
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): presumably ro_rt is always NULL here after
		 * ROUTE_RELEASE (the IPv4 variant VERIFYs this instead of
		 * testing it) — confirm against the ROUTE_RELEASE macro.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10430 
/*
 * Update the interface's link quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event when it changes.
 *
 * 'locked' indicates the caller already holds the ifnet lock exclusive;
 * either way the lock is dropped around the kernel-event post, and on
 * return the lock is held iff it was held on entry.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* Kick the TCP timer so connections on a dead link abort soon */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10495 
/*
 * Update the cellular RRC (radio resource control) state and post
 * KEV_DL_RRC_STATE_CHANGED when it changes.
 *
 * Called with the ifnet lock held exclusive (see if_state_update());
 * the lock is dropped around the kernel-event post and reacquired
 * before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* No-op when the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10525 
10526 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10527 if_state_update(struct ifnet *ifp,
10528     struct if_interface_state *if_interface_state)
10529 {
10530 	u_short if_index_available = 0;
10531 
10532 	ifnet_lock_exclusive(ifp);
10533 
10534 	if ((ifp->if_type != IFT_CELLULAR) &&
10535 	    (if_interface_state->valid_bitmask &
10536 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10537 		ifnet_lock_done(ifp);
10538 		return ENOTSUP;
10539 	}
10540 	if ((if_interface_state->valid_bitmask &
10541 	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10542 	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10543 	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10544 		ifnet_lock_done(ifp);
10545 		return EINVAL;
10546 	}
10547 	if ((if_interface_state->valid_bitmask &
10548 	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10549 	    if_interface_state->rrc_state !=
10550 	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10551 	    if_interface_state->rrc_state !=
10552 	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10553 		ifnet_lock_done(ifp);
10554 		return EINVAL;
10555 	}
10556 
10557 	if (if_interface_state->valid_bitmask &
10558 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10559 		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10560 	}
10561 	if (if_interface_state->valid_bitmask &
10562 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10563 		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10564 	}
10565 	if (if_interface_state->valid_bitmask &
10566 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10567 		ifp->if_interface_state.valid_bitmask |=
10568 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10569 		ifp->if_interface_state.interface_availability =
10570 		    if_interface_state->interface_availability;
10571 
10572 		if (ifp->if_interface_state.interface_availability ==
10573 		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10574 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10575 			    __func__, if_name(ifp), ifp->if_index);
10576 			if_index_available = ifp->if_index;
10577 		} else {
10578 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10579 			    __func__, if_name(ifp), ifp->if_index);
10580 		}
10581 	}
10582 	ifnet_lock_done(ifp);
10583 
10584 	/*
10585 	 * Check if the TCP connections going on this interface should be
10586 	 * forced to send probe packets instead of waiting for TCP timers
10587 	 * to fire. This is done on an explicit notification such as
10588 	 * SIOCSIFINTERFACESTATE which marks the interface as available.
10589 	 */
10590 	if (if_index_available > 0) {
10591 		tcp_interface_send_probe(if_index_available);
10592 	}
10593 
10594 	return 0;
10595 }
10596 
10597 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10598 if_get_state(struct ifnet *ifp,
10599     struct if_interface_state *if_interface_state)
10600 {
10601 	ifnet_lock_shared(ifp);
10602 
10603 	if_interface_state->valid_bitmask = 0;
10604 
10605 	if (ifp->if_interface_state.valid_bitmask &
10606 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10607 		if_interface_state->valid_bitmask |=
10608 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10609 		if_interface_state->rrc_state =
10610 		    ifp->if_interface_state.rrc_state;
10611 	}
10612 	if (ifp->if_interface_state.valid_bitmask &
10613 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10614 		if_interface_state->valid_bitmask |=
10615 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10616 		if_interface_state->lqm_state =
10617 		    ifp->if_interface_state.lqm_state;
10618 	}
10619 	if (ifp->if_interface_state.valid_bitmask &
10620 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10621 		if_interface_state->valid_bitmask |=
10622 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10623 		if_interface_state->interface_availability =
10624 		    ifp->if_interface_state.interface_availability;
10625 	}
10626 
10627 	ifnet_lock_done(ifp);
10628 }
10629 
10630 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10631 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10632 {
10633 	if (conn_probe > 1) {
10634 		return EINVAL;
10635 	}
10636 	if (conn_probe == 0) {
10637 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10638 	} else {
10639 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10640 	}
10641 
10642 #if NECP
10643 	necp_update_all_clients();
10644 #endif /* NECP */
10645 
10646 	tcp_probe_connectivity(ifp, conn_probe);
10647 	return 0;
10648 }
10649 
10650 /* for uuid.c */
/* for uuid.c */
/*
 * Find the interface index of the best "ethernet" interface:
 * returns en0's index if en0 exists; otherwise returns 0 and stores a
 * fallback in *ret_other_index (lowest-unit en*, else any IFT_ETHER
 * interface, else 0).  Caller holds the ifnet head lock (iterates
 * ifnet_head).
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			/* remember the lowest-unit en* seen so far */
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	/* Only publish a fallback when en0 itself was not found */
	if (en0_index == 0) {
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
10692 
/*
 * Copy the MAC address of the primary ethernet interface into 'node'
 * (ETHER_ADDR_LEN bytes), preferring en0, then the best fallback from
 * get_ether_index().  The en0 index is cached across calls and
 * revalidated against ifindex2ifnet.  Returns 0 on success, -1 when no
 * suitable interface exists.  Used by uuid.c for UUID node material.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	/* Cached en0 index; zero-initialized at boot, refreshed when stale */
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* Revalidate the cache: the cached index may no longer be attached */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10734 
10735 static int
10736 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10737 {
10738 #pragma unused(arg1, arg2)
10739 	uint32_t i;
10740 	int err;
10741 
10742 	i = if_rxpoll;
10743 
10744 	err = sysctl_handle_int(oidp, &i, 0, req);
10745 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10746 		return err;
10747 	}
10748 
10749 	if (net_rxpoll == 0) {
10750 		return ENXIO;
10751 	}
10752 
10753 	if_rxpoll = i;
10754 	return err;
10755 }
10756 
10757 static int
10758 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10759 {
10760 #pragma unused(arg1, arg2)
10761 	uint64_t q;
10762 	int err;
10763 
10764 	q = if_rxpoll_mode_holdtime;
10765 
10766 	err = sysctl_handle_quad(oidp, &q, 0, req);
10767 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10768 		return err;
10769 	}
10770 
10771 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10772 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10773 	}
10774 
10775 	if_rxpoll_mode_holdtime = q;
10776 
10777 	return err;
10778 }
10779 
10780 static int
10781 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10782 {
10783 #pragma unused(arg1, arg2)
10784 	uint64_t q;
10785 	int err;
10786 
10787 	q = if_rxpoll_sample_holdtime;
10788 
10789 	err = sysctl_handle_quad(oidp, &q, 0, req);
10790 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10791 		return err;
10792 	}
10793 
10794 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10795 		q = IF_RXPOLL_SAMPLETIME_MIN;
10796 	}
10797 
10798 	if_rxpoll_sample_holdtime = q;
10799 
10800 	return err;
10801 }
10802 
10803 static int
10804 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10805 {
10806 #pragma unused(arg1, arg2)
10807 	uint64_t q;
10808 	int err;
10809 
10810 	q = if_rxpoll_interval_time;
10811 
10812 	err = sysctl_handle_quad(oidp, &q, 0, req);
10813 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10814 		return err;
10815 	}
10816 
10817 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10818 		q = IF_RXPOLL_INTERVALTIME_MIN;
10819 	}
10820 
10821 	if_rxpoll_interval_time = q;
10822 
10823 	return err;
10824 }
10825 
10826 static int
10827 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10828 {
10829 #pragma unused(arg1, arg2)
10830 	uint32_t i;
10831 	int err;
10832 
10833 	i = if_sysctl_rxpoll_wlowat;
10834 
10835 	err = sysctl_handle_int(oidp, &i, 0, req);
10836 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10837 		return err;
10838 	}
10839 
10840 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10841 		return EINVAL;
10842 	}
10843 
10844 	if_sysctl_rxpoll_wlowat = i;
10845 	return err;
10846 }
10847 
10848 static int
10849 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10850 {
10851 #pragma unused(arg1, arg2)
10852 	uint32_t i;
10853 	int err;
10854 
10855 	i = if_sysctl_rxpoll_whiwat;
10856 
10857 	err = sysctl_handle_int(oidp, &i, 0, req);
10858 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10859 		return err;
10860 	}
10861 
10862 	if (i <= if_sysctl_rxpoll_wlowat) {
10863 		return EINVAL;
10864 	}
10865 
10866 	if_sysctl_rxpoll_whiwat = i;
10867 	return err;
10868 }
10869 
10870 static int
10871 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10872 {
10873 #pragma unused(arg1, arg2)
10874 	int i, err;
10875 
10876 	i = if_sndq_maxlen;
10877 
10878 	err = sysctl_handle_int(oidp, &i, 0, req);
10879 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10880 		return err;
10881 	}
10882 
10883 	if (i < IF_SNDQ_MINLEN) {
10884 		i = IF_SNDQ_MINLEN;
10885 	}
10886 
10887 	if_sndq_maxlen = i;
10888 	return err;
10889 }
10890 
10891 static int
10892 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10893 {
10894 #pragma unused(arg1, arg2)
10895 	int i, err;
10896 
10897 	i = if_rcvq_maxlen;
10898 
10899 	err = sysctl_handle_int(oidp, &i, 0, req);
10900 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10901 		return err;
10902 	}
10903 
10904 	if (i < IF_RCVQ_MINLEN) {
10905 		i = IF_RCVQ_MINLEN;
10906 	}
10907 
10908 	if_rcvq_maxlen = i;
10909 	return err;
10910 }
10911 
10912 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10913 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10914     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10915 {
10916 	struct kev_dl_node_presence kev;
10917 	struct sockaddr_dl *sdl;
10918 	struct sockaddr_in6 *sin6;
10919 	int ret = 0;
10920 
10921 	VERIFY(ifp);
10922 	VERIFY(sa);
10923 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10924 
10925 	bzero(&kev, sizeof(kev));
10926 	sin6 = &kev.sin6_node_address;
10927 	sdl = &kev.sdl_node_address;
10928 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10929 	kev.rssi = rssi;
10930 	kev.link_quality_metric = lqm;
10931 	kev.node_proximity_metric = npm;
10932 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10933 
10934 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10935 	if (ret == 0 || ret == EEXIST) {
10936 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10937 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10938 		if (err != 0) {
10939 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10940 			    "error %d\n", __func__, err);
10941 		}
10942 	}
10943 
10944 	if (ret == EEXIST) {
10945 		ret = 0;
10946 	}
10947 	return ret;
10948 }
10949 
/*
 * Record that a peer node has left the link: remove it from the IPv6
 * neighbor layer via nd6_alt_node_absent() and, on success, post a
 * KEV_DL_NODE_ABSENCE kernel event.  'sa' may be either the node's
 * AF_INET6 address or its AF_LINK address.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp the interface type/index into the event's sockaddr_dl */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
10990 
/*
 * Variant of dlil_node_present() where the caller supplies the IPv6
 * address ('sa') and the link-layer address ('sdl') separately instead
 * of having them derived.  Updates the IPv6 neighbor layer and posts
 * KEV_DL_NODE_PRESENCE on success; EEXIST is mapped to 0.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* Copy caller's link-layer address, stamping in our type/index */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* Already-present nodes are not an error to the caller */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11034 
11035 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11036 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11037     kauth_cred_t *credp)
11038 {
11039 	const u_int8_t *bytes;
11040 	size_t size;
11041 
11042 	bytes = CONST_LLADDR(sdl);
11043 	size = sdl->sdl_alen;
11044 
11045 #if CONFIG_MACF
11046 	if (dlil_lladdr_ckreq) {
11047 		switch (sdl->sdl_type) {
11048 		case IFT_ETHER:
11049 		case IFT_IEEE1394:
11050 			break;
11051 		default:
11052 			credp = NULL;
11053 			break;
11054 		}
11055 		;
11056 
11057 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11058 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11059 				[0] = 2
11060 			};
11061 
11062 			bytes = unspec;
11063 		}
11064 	}
11065 #else
11066 #pragma unused(credp)
11067 #endif
11068 
11069 	if (sizep != NULL) {
11070 		*sizep = size;
11071 	}
11072 	return bytes;
11073 }
11074 
11075 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11076 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11077     u_int8_t info[DLIL_MODARGLEN])
11078 {
11079 	struct kev_dl_issues kev;
11080 	struct timeval tv;
11081 
11082 	VERIFY(ifp != NULL);
11083 	VERIFY(modid != NULL);
11084 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11085 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11086 
11087 	bzero(&kev, sizeof(kev));
11088 
11089 	microtime(&tv);
11090 	kev.timestamp = tv.tv_sec;
11091 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11092 	if (info != NULL) {
11093 		bcopy(info, &kev.info, DLIL_MODARGLEN);
11094 	}
11095 
11096 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11097 	    &kev.link_data, sizeof(kev), FALSE);
11098 }
11099 
/*
 * Handle SIOCSIFOPPORTUNISTIC (set) and SIOCGIFOPPORTUNISTIC (get)
 * ioctls: toggle or query the interface throttle level that blocks
 * opportunistic traffic.  On success, ifo_inuse is filled with the
 * count of opportunistic TCP+UDP connections on the interface.
 * EALREADY from ifnet_set_throttle is mapped to 0.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Map the single supported flag onto a throttle level */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* Get: translate the throttle level back into the flag */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* Setting the same level twice is not an error to the caller */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11158 
11159 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)11160 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11161 {
11162 	struct ifclassq *ifq;
11163 	int err = 0;
11164 
11165 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11166 		return ENXIO;
11167 	}
11168 
11169 	*level = IFNET_THROTTLE_OFF;
11170 
11171 	ifq = ifp->if_snd;
11172 	IFCQ_LOCK(ifq);
11173 	/* Throttling works only for IFCQ, not ALTQ instances */
11174 	if (IFCQ_IS_ENABLED(ifq)) {
11175 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11176 
11177 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11178 		*level = req.level;
11179 	}
11180 	IFCQ_UNLOCK(ifq);
11181 
11182 	return err;
11183 }
11184 
11185 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)11186 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11187 {
11188 	struct ifclassq *ifq;
11189 	int err = 0;
11190 
11191 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11192 		return ENXIO;
11193 	}
11194 
11195 	ifq = ifp->if_snd;
11196 
11197 	switch (level) {
11198 	case IFNET_THROTTLE_OFF:
11199 	case IFNET_THROTTLE_OPPORTUNISTIC:
11200 		break;
11201 	default:
11202 		return EINVAL;
11203 	}
11204 
11205 	IFCQ_LOCK(ifq);
11206 	if (IFCQ_IS_ENABLED(ifq)) {
11207 		cqrq_throttle_t req = { 1, level };
11208 
11209 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11210 	}
11211 	IFCQ_UNLOCK(ifq);
11212 
11213 	if (err == 0) {
11214 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11215 		    level);
11216 #if NECP
11217 		necp_update_all_clients();
11218 #endif /* NECP */
11219 		if (level == IFNET_THROTTLE_OFF) {
11220 			ifnet_start(ifp);
11221 		}
11222 	}
11223 
11224 	return err;
11225 }
11226 
/*
 * Handle the SIOCSIFLOG/SIOCGIFLOG ioctls: set or get the interface
 * logging level, facility flags, category and subcategory.  Setting
 * requires the PRIV_NET_INTERFACE_CONTROL privilege.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		/* Requested level must fall inside the supported range */
		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* At least one known facility bit must be set */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		/* Get: copy the current settings back into the ifreq */
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11274 
/*
 * Set the logging level and facility flags on an interface.  The
 * level applies to all facilities, so flags accumulate with whatever
 * facilities were previously enabled.  If the driver registered an
 * output control callback, non-DLIL facility bits are forwarded to it
 * via IFNET_CTL_SET_LOG.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		l.flags &= ~IFNET_LOGF_DLIL;    /* DLIL bit is handled here, not by the driver */
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * NOTE(review): the "ifp->if_output_ctl == NULL" test above
		 * is redundant; this branch is only reachable when it is NULL.
		 */
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT resets all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11333 
11334 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11335 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11336     int32_t *category, int32_t *subcategory)
11337 {
11338 	if (level != NULL) {
11339 		*level = ifp->if_log.level;
11340 	}
11341 	if (flags != NULL) {
11342 		*flags = ifp->if_log.flags;
11343 	}
11344 	if (category != NULL) {
11345 		*category = ifp->if_log.category;
11346 	}
11347 	if (subcategory != NULL) {
11348 		*subcategory = ifp->if_log.subcategory;
11349 	}
11350 
11351 	return 0;
11352 }
11353 
11354 int
ifnet_notify_address(struct ifnet * ifp,int af)11355 ifnet_notify_address(struct ifnet *ifp, int af)
11356 {
11357 	struct ifnet_notify_address_params na;
11358 
11359 #if PF
11360 	(void) pf_ifaddr_hook(ifp);
11361 #endif /* PF */
11362 
11363 	if (ifp->if_output_ctl == NULL) {
11364 		return EOPNOTSUPP;
11365 	}
11366 
11367 	bzero(&na, sizeof(na));
11368 	na.address_family = (sa_family_t)af;
11369 
11370 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11371 	           sizeof(na), &na);
11372 }
11373 
11374 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11375 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11376 {
11377 	if (ifp == NULL || flowid == NULL) {
11378 		return EINVAL;
11379 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11380 	    !IF_FULLY_ATTACHED(ifp)) {
11381 		return ENXIO;
11382 	}
11383 
11384 	*flowid = ifp->if_flowhash;
11385 
11386 	return 0;
11387 }
11388 
11389 errno_t
ifnet_disable_output(struct ifnet * ifp)11390 ifnet_disable_output(struct ifnet *ifp)
11391 {
11392 	int err;
11393 
11394 	if (ifp == NULL) {
11395 		return EINVAL;
11396 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11397 	    !IF_FULLY_ATTACHED(ifp)) {
11398 		return ENXIO;
11399 	}
11400 
11401 	if ((err = ifnet_fc_add(ifp)) == 0) {
11402 		lck_mtx_lock_spin(&ifp->if_start_lock);
11403 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11404 		lck_mtx_unlock(&ifp->if_start_lock);
11405 	}
11406 	return err;
11407 }
11408 
11409 errno_t
ifnet_enable_output(struct ifnet * ifp)11410 ifnet_enable_output(struct ifnet *ifp)
11411 {
11412 	if (ifp == NULL) {
11413 		return EINVAL;
11414 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11415 	    !IF_FULLY_ATTACHED(ifp)) {
11416 		return ENXIO;
11417 	}
11418 
11419 	ifnet_start_common(ifp, TRUE);
11420 	return 0;
11421 }
11422 
/*
 * Flow advisory callback: re-enable output on the interface whose
 * flow hash matches, provided it is still attached.  Consumes (frees)
 * the flow control entry looked up by ifnet_fc_get().
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* Lookup also removes the entry from the tree; NULL if absent */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the reference taken by ifnet_is_attached() above */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11446 
11447 /*
11448  * Function to compare ifnet_fc_entries in ifnet flow control tree
11449  */
11450 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11451 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11452 {
11453 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11454 }
11455 
/*
 * Register ifp in the flow control tree, keyed by its flow hash.
 * Returns 0 if added (or already present for this ifp), EAGAIN when a
 * different interface already occupies the same flow hash (collision).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Build a stack key for the RB lookup */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: zalloc below may block (Z_WAITOK) */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11499 
/*
 * Look up and remove the flow control entry for the given flow hash.
 * Returns the entry (which the caller must release with
 * ifnet_fc_entry_free()), or NULL if no entry exists or its interface
 * is no longer attached.  In either case the entry is no longer in
 * the tree on return.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Build a stack key for the RB lookup */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Found: the entry always leaves the tree here */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11537 
/*
 * Return a flow control entry to its zone allocator.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11543 
11544 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11545 ifnet_calc_flowhash(struct ifnet *ifp)
11546 {
11547 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11548 	uint32_t flowhash = 0;
11549 
11550 	if (ifnet_flowhash_seed == 0) {
11551 		ifnet_flowhash_seed = RandomULong();
11552 	}
11553 
11554 	bzero(&fh, sizeof(fh));
11555 
11556 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11557 	fh.ifk_unit = ifp->if_unit;
11558 	fh.ifk_flags = ifp->if_flags;
11559 	fh.ifk_eflags = ifp->if_eflags;
11560 	fh.ifk_capabilities = ifp->if_capabilities;
11561 	fh.ifk_capenable = ifp->if_capenable;
11562 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11563 	fh.ifk_rand1 = RandomULong();
11564 	fh.ifk_rand2 = RandomULong();
11565 
11566 try_again:
11567 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11568 	if (flowhash == 0) {
11569 		/* try to get a non-zero flowhash */
11570 		ifnet_flowhash_seed = RandomULong();
11571 		goto try_again;
11572 	}
11573 
11574 	return flowhash;
11575 }
11576 
/*
 * Install (or clear, when len == 0) the network signature for the
 * given address family on the interface.  The signature lives in the
 * per-family extension data (IN_IFEXTRA / IN6_IFEXTRA).
 *
 * Returns 0 on success; EINVAL for an unsupported family or an
 * oversized signature; ENOMEM when the per-family extension data is
 * not allocated.  The "flags" argument is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* note: break leaves the switch, so unlock first */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* note: break leaves the switch, so unlock first */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11638 
/*
 * Copy out the network signature for the given address family.  On
 * input *len is the caller's buffer size; on success it is updated to
 * the actual signature length.
 *
 * Returns 0 on success; EINVAL for bad arguments, an unsupported
 * family, or a buffer smaller than the stored signature; ENOENT when
 * no signature is set; ENOMEM when the per-family extension data is
 * not allocated.  *flags (if provided) is zeroed on success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11699 
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the
 * interface.  A slot with prefix_len == 0 clears that slot.  Only the
 * well-known NAT64 prefix lengths (32/40/48/56/64/96 bits) are
 * accepted, and scoped (interface/link-local) prefixes are rejected.
 * When at least one prefix was set successfully, NECP clients are
 * notified of the change.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Notify NECP clients only after the lock is dropped */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11765 
11766 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11767 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11768 {
11769 	int i, found_one = 0, error = 0;
11770 
11771 	if (ifp == NULL) {
11772 		return EINVAL;
11773 	}
11774 
11775 	if_inet6data_lock_shared(ifp);
11776 
11777 	if (IN6_IFEXTRA(ifp) == NULL) {
11778 		error = ENOMEM;
11779 		goto out;
11780 	}
11781 
11782 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11783 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11784 			found_one = 1;
11785 		}
11786 	}
11787 
11788 	if (found_one == 0) {
11789 		error = ENOENT;
11790 		goto out;
11791 	}
11792 
11793 	if (prefixes) {
11794 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11795 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11796 	}
11797 
11798 out:
11799 	if_inet6data_lock_done(ifp);
11800 
11801 	return error;
11802 }
11803 
/*
 * Debug-only transmit hook: when HWCKSUM_DBG_FINALIZE_FORCED is set,
 * force software finalization of the IP/transport checksums on
 * outbound IPv4/IPv6 packets (TSO packets excluded), and count how
 * many header/data checksums were computed in software.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* Skip unless forcing is enabled; never finalize TSO packets */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11845 
/*
 * Debug-only receive hook for partial checksum offload:
 *
 *  - With HWCKSUM_DBG_PARTIAL_FORCED, emulate a NIC that performs
 *    partial checksums by computing the 16-bit 1's complement sum in
 *    software from a forced start offset and stamping it on the mbuf.
 *
 *  - For packets carrying a partial checksum (CSUM_PARTIAL without
 *    CSUM_PSEUDO_HDR), verify the provided sum against a software
 *    computation, then optionally (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)
 *    adjust it to emulate hardware that starts summing at a
 *    different receive offset.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer against the mbuf bounds */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11970 
11971 static int
11972 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11973 {
11974 #pragma unused(arg1, arg2)
11975 	u_int32_t i;
11976 	int err;
11977 
11978 	i = hwcksum_dbg_mode;
11979 
11980 	err = sysctl_handle_int(oidp, &i, 0, req);
11981 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11982 		return err;
11983 	}
11984 
11985 	if (hwcksum_dbg == 0) {
11986 		return ENODEV;
11987 	}
11988 
11989 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
11990 		return EINVAL;
11991 	}
11992 
11993 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
11994 
11995 	return err;
11996 }
11997 
11998 static int
11999 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12000 {
12001 #pragma unused(arg1, arg2)
12002 	u_int32_t i;
12003 	int err;
12004 
12005 	i = hwcksum_dbg_partial_rxoff_forced;
12006 
12007 	err = sysctl_handle_int(oidp, &i, 0, req);
12008 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12009 		return err;
12010 	}
12011 
12012 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12013 		return ENODEV;
12014 	}
12015 
12016 	hwcksum_dbg_partial_rxoff_forced = i;
12017 
12018 	return err;
12019 }
12020 
12021 static int
12022 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12023 {
12024 #pragma unused(arg1, arg2)
12025 	u_int32_t i;
12026 	int err;
12027 
12028 	i = hwcksum_dbg_partial_rxoff_adj;
12029 
12030 	err = sysctl_handle_int(oidp, &i, 0, req);
12031 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12032 		return err;
12033 	}
12034 
12035 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12036 		return ENODEV;
12037 	}
12038 
12039 	hwcksum_dbg_partial_rxoff_adj = i;
12040 
12041 	return err;
12042 }
12043 
12044 static int
12045 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12046 {
12047 #pragma unused(oidp, arg1, arg2)
12048 	int err;
12049 
12050 	if (req->oldptr == USER_ADDR_NULL) {
12051 	}
12052 	if (req->newptr != USER_ADDR_NULL) {
12053 		return EPERM;
12054 	}
12055 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12056 	    sizeof(struct chain_len_stats));
12057 
12058 	return err;
12059 }
12060 
12061 
12062 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification: arbitrary binary data (appears to be a
 * gzip stream, per the 0x1f 0x8b magic) used as the input buffer for
 * the checksum self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12099 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* TRUE once sumr has been computed at runtime */
	uint16_t        len;    /* span length in bytes, starting at offset 0 */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12124 
/*
 * Self-test for the 16-bit 1's complement sum routines (m_sum16 and,
 * when INET, b_sum16) against the reference in_cksum_mbuf_ref() over
 * the sumtbl spans, at all byte alignments within a uint64_t and at
 * both zero and non-zero mbuf data offsets.  Panics on any mismatch.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Lazily compute the reference sum on first use */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
12213 #endif /* DEBUG || DEVELOPMENT */
12214 
12215 #define CASE_STRINGIFY(x) case x: return #x
12216 
12217 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)12218 dlil_kev_dl_code_str(u_int32_t event_code)
12219 {
12220 	switch (event_code) {
12221 		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
12222 		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
12223 		CASE_STRINGIFY(KEV_DL_SIFMTU);
12224 		CASE_STRINGIFY(KEV_DL_SIFPHYS);
12225 		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
12226 		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
12227 		CASE_STRINGIFY(KEV_DL_ADDMULTI);
12228 		CASE_STRINGIFY(KEV_DL_DELMULTI);
12229 		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
12230 		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
12231 		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
12232 		CASE_STRINGIFY(KEV_DL_LINK_OFF);
12233 		CASE_STRINGIFY(KEV_DL_LINK_ON);
12234 		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
12235 		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
12236 		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
12237 		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
12238 		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
12239 		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
12240 		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
12241 		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
12242 		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
12243 		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
12244 		CASE_STRINGIFY(KEV_DL_ISSUES);
12245 		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
12246 	default:
12247 		break;
12248 	}
12249 	return "";
12250 }
12251 
12252 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12253 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12254 {
12255 #pragma unused(arg1)
12256 	struct ifnet *ifp = arg0;
12257 
12258 	if (ifnet_is_attached(ifp, 1)) {
12259 		nstat_ifnet_threshold_reached(ifp->if_index);
12260 		ifnet_decr_iorefcnt(ifp);
12261 	}
12262 }
12263 
/*
 * Check whether the interface's cumulative byte count has advanced past
 * its data threshold and, if so, schedule a (rate-limited) notification
 * to NetworkStatistics via the interface's deferred thread call.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	/*
	 * The CAS ensures only one racing caller both advances if_dt_bytes
	 * and arms the thread call; losers simply skip the notification.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer to the next periodic boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12293 
12294 #if (DEVELOPMENT || DEBUG)
/*
 * The sysctl variable name contains the input parameters of
 * ifnet_get_keepalive_offload_frames()
 *  ifp (interface index): name[0]
 *  frames_array_count:    name[1]
 *  frame_data_offset:     name[2]
 * The return length gives used_frames_count
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	/* Read-only sysctl: reject any attempt to write */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	/*
	 * NOTE(review): name[2] is a signed int; a negative value becomes a
	 * huge size_t here -- presumably rejected downstream by
	 * ifnet_get_keepalive_offload_frames(); verify.
	 */
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	/*
	 * NOTE(review): the multiplication could wrap on an ILP32 kernel
	 * before the comparison; on LP64 the product fits in size_t.
	 */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	/*
	 * NOTE(review): ifp is used below after the head lock is dropped
	 * without taking an I/O reference -- confirm detach cannot race here.
	 */
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy only the frames actually filled in back to userland */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12394 #endif /* DEVELOPMENT || DEBUG */
12395 
/*
 * Thin wrapper forwarding per-flow interface statistics to the TCP layer.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12402 
/*
 * Atomically OR set_flags into *flags_p; returns the previous flag value.
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}

/*
 * Atomically clear clear_flags from *flags_p.
 */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12414 
/* Atomically set extended interface flags; returns the previous value. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}

/* Atomically clear extended interface flags. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}

/* Atomically set experimental (x) interface flags; returns the previous value. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}

/* Atomically clear experimental (x) interface flags. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12438 
/* Bump the interface's traffic-rule generation counter atomically. */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}
12444 
12445 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12446 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12447 {
12448 	if (*genid != ifp->if_traffic_rule_genid) {
12449 		*genid = ifp->if_traffic_rule_genid;
12450 		return TRUE;
12451 	}
12452 	return FALSE;
12453 }
/*
 * Publish a new traffic-rule count for the interface and bump the
 * generation id so observers notice the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12460 
12461 static void
log_hexdump(void * data,size_t len)12462 log_hexdump(void *data, size_t len)
12463 {
12464 	size_t i, j, k;
12465 	unsigned char *ptr = (unsigned char *)data;
12466 #define MAX_DUMP_BUF 32
12467 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12468 
12469 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12470 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12471 			unsigned char msnbl = ptr[j] >> 4;
12472 			unsigned char lsnbl = ptr[j] & 0x0f;
12473 
12474 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12475 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12476 
12477 			if ((j % 2) == 1) {
12478 				buf[k++] = ' ';
12479 			}
12480 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12481 				buf[k++] = ' ';
12482 			}
12483 		}
12484 		buf[k] = 0;
12485 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12486 	}
12487 }
12488 
12489 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12490 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12491 net_check_compatible_if_filter(struct ifnet *ifp)
12492 {
12493 	if (ifp == NULL) {
12494 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12495 			return false;
12496 		}
12497 	} else {
12498 		if (ifp->if_flt_non_os_count > 0) {
12499 			return false;
12500 		}
12501 	}
12502 	return true;
12503 }
12504 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12505 
/*
 * Advance the dump cursor after a scnprintf(): subtract the bytes just
 * written (k) from the remaining space (clen) and jump to the enclosing
 * function's "done" label when the buffer is exhausted.  Expects c, clen,
 * k and a done: label to be in scope at the expansion site.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
12512 
12513 int dlil_dump_top_if_qlen(char *, int);
12514 int
dlil_dump_top_if_qlen(char * str,int str_len)12515 dlil_dump_top_if_qlen(char *str, int str_len)
12516 {
12517 	char *c = str;
12518 	int k, clen = str_len;
12519 	struct ifnet *top_ifcq_ifp = NULL;
12520 	uint32_t top_ifcq_len = 0;
12521 	struct ifnet *top_inq_ifp = NULL;
12522 	uint32_t top_inq_len = 0;
12523 
12524 	for (int ifidx = 1; ifidx < if_index; ifidx++) {
12525 		struct ifnet *ifp = ifindex2ifnet[ifidx];
12526 		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12527 
12528 		if (ifp == NULL) {
12529 			continue;
12530 		}
12531 		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12532 			top_ifcq_len = ifp->if_snd->ifcq_len;
12533 			top_ifcq_ifp = ifp;
12534 		}
12535 		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12536 			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12537 			top_inq_ifp = ifp;
12538 		}
12539 	}
12540 
12541 	if (top_ifcq_ifp != NULL) {
12542 		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12543 		    top_ifcq_len, top_ifcq_ifp->if_xname);
12544 		DUMP_BUF_CHK();
12545 	}
12546 	if (top_inq_ifp != NULL) {
12547 		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12548 		    top_inq_len, top_inq_ifp->if_xname);
12549 		DUMP_BUF_CHK();
12550 	}
12551 done:
12552 	return str_len - clen;
12553 }
12554 
12555 #if DEVELOPMENT || DEBUG
12556 __private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12557 packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12558 {
12559 	struct flow_key key = {};
12560 	int error = 0;
12561 
12562 	if (req->newptr == USER_ADDR_NULL) {
12563 		return EINVAL;
12564 	}
12565 	if (req->newlen < sizeof(struct flow_key)) {
12566 		return EINVAL;
12567 	}
12568 	error = SYSCTL_IN(req, &key, sizeof(struct flow_key));
12569 	if (error != 0) {
12570 		return error;
12571 	}
12572 
12573 	switch (key.fk_ipver) {
12574 	case IPVERSION:
12575 		if (key.fk_proto != IPPROTO_UDP ||
12576 		    key.fk_sport == 0 || key.fk_dport == 0) {
12577 			return EINVAL;
12578 		}
12579 
12580 		if (key.fk_src4.s_addr == INADDR_ANY ||
12581 		    key.fk_dst4.s_addr == INADDR_ANY) {
12582 			return EINVAL;
12583 		}
12584 
12585 		break;
12586 	case IPV6_VERSION:
12587 		if (key.fk_proto != IPPROTO_UDP ||
12588 		    key.fk_sport == 0 || key.fk_dport == 0) {
12589 			return EINVAL;
12590 		}
12591 
12592 		if (IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12593 		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12594 			return EINVAL;
12595 		}
12596 
12597 		break;
12598 	case 0:
12599 		if (key.fk_proto != 0 ||
12600 		    key.fk_sport != 0 || key.fk_dport != 0) {
12601 			return EINVAL;
12602 		}
12603 
12604 		if (!IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12605 		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12606 			return EINVAL;
12607 		}
12608 
12609 		break;
12610 	default:
12611 		return EINVAL;
12612 	}
12613 
12614 	memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
12615 	return 0;
12616 }
12617 #endif /* DEVELOPMENT || DEBUG */
12618