xref: /xnu-8792.81.2/bsd/net/dlil.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 1999-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include <stddef.h>
35 #include <ptrauth.h>
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62 
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69 
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
146 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151 
152 #define IFNET_KTRACE_TX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x001)
153 #define IFNET_KTRACE_RX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x002)
154 
155 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
156 #define MAX_LINKADDR        4 /* LONGWORDS */
157 
158 
159 #if 1
160 #define DLIL_PRINTF     printf
161 #else
162 #define DLIL_PRINTF     kprintf
163 #endif
164 
165 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
166 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
167 
168 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
169 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170 
/*
 * Protocol KPI versions.  Stored in if_proto.proto_kpi to select which
 * arm of the if_proto "kpi" union (v1 or v2) holds the callbacks.
 */
171 enum {
172 	kProtoKPI_v1    = 1,	/* callbacks in kpi.v1 (proto_media_input) */
173 	kProtoKPI_v2    = 2	/* callbacks in kpi.v2 (proto_media_input_v2) */
174 };
175 
176 /*
177  * List of if_proto structures in if_proto_hash[] is protected by
178  * the ifnet lock.  The rest of the fields are initialized at protocol
179  * attach time and never change, thus no lock required as long as
180  * a reference to it is valid, via if_proto_ref().
181  */
182 struct if_proto {
183 	SLIST_ENTRY(if_proto)       next_hash;	/* if_proto_hash[] linkage (ifnet lock) */
184 	u_int32_t                   refcount;	/* references; see if_proto_ref() */
185 	u_int32_t                   detached;	/* detach state flag */
186 	struct ifnet                *ifp;	/* interface this protocol is attached to */
187 	protocol_family_t           protocol_family;	/* attached protocol family */
188 	int                         proto_kpi;	/* kProtoKPI_v1 or kProtoKPI_v2 */
189 	union {	/* protocol callbacks; arm chosen by proto_kpi */
190 		struct {
191 			proto_media_input               input;
192 			proto_media_preout              pre_output;
193 			proto_media_event               event;
194 			proto_media_ioctl               ioctl;
195 			proto_media_detached            detached;
196 			proto_media_resolve_multi       resolve_multi;
197 			proto_media_send_arp            send_arp;
198 		} v1;
199 		struct {
200 			proto_media_input_v2            input;	/* v2 input callback type */
201 			proto_media_preout              pre_output;
202 			proto_media_event               event;
203 			proto_media_ioctl               ioctl;
204 			proto_media_detached            detached;
205 			proto_media_resolve_multi       resolve_multi;
206 			proto_media_send_arp            send_arp;
207 		} v2;
208 	} kpi;
209 };
210 
211 SLIST_HEAD(proto_hash_entry, if_proto);
212 
213 #define DLIL_SDLDATALEN \
214 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215 
/*
 * DLIL's private wrapper around the public ifnet.  dl_if must remain the
 * first member so the DLIL_TO_IFP()/IFP_TO_DLIL() pointer casts below are
 * valid.  Instances are kept on dlil_ifnet_head and recycled (DLIF_REUSE).
 */
216 struct dlil_ifnet {
217 	struct ifnet    dl_if;                  /* public ifnet */
218 	/*
219 	 * DLIL private fields, protected by dl_if_lock
220 	 */
221 	decl_lck_mtx_data(, dl_if_lock);
222 	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
223 	u_int32_t dl_if_flags;                  /* flags (below) */
224 	u_int32_t dl_if_refcnt;                 /* refcnt */
225 	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
226 	void    *dl_if_uniqueid;                /* unique interface id */
227 	size_t  dl_if_uniqueid_len;             /* length of the unique id */
228 	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
229 	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
230 	struct {
231 		struct ifaddr   ifa;            /* lladdr ifa */
232 		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
233 		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
234 	} dl_if_lladdr;
235 	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
236 	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
237 	u_int8_t dl_if_permanent_ether_is_set;  /* dl_if_permanent_ether valid */
238 	u_int8_t dl_if_unused;                  /* currently unused */
239 	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
240 	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
241 	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
242 };
243 
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG      0x4     /* has debugging info */
248 
249 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
250 
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253 
/*
 * Debug variant of dlil_ifnet: the base struct (kept first so the object
 * can stand in for a plain dlil_ifnet) plus reference hold/release counts
 * and call-site stacktrace history for ifnet reference debugging.
 */
254 struct dlil_ifnet_dbg {
255 	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
256 	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
257 	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
258 	/*
259 	 * Circular lists of ifnet_{reference,release} callers.
260 	 */
261 	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
262 	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
263 };
264 
265 #define DLIL_TO_IFP(s)  (&s->dl_if)
266 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
267 
/*
 * One attached interface filter (allocated from dlif_filt_zone).
 * Input/output/event/ioctl/detached callbacks are invoked from the
 * dlil_interface_filters_* paths declared below.
 */
268 struct ifnet_filter {
269 	TAILQ_ENTRY(ifnet_filter)       filt_next;	/* filter list linkage */
270 	u_int32_t                       filt_skip;	/* NOTE(review): presumably a skip flag; confirm at use sites */
271 	u_int32_t                       filt_flags;
272 	ifnet_t                         filt_ifp;	/* interface being filtered */
273 	const char                      *filt_name;
274 	void                            *filt_cookie;	/* opaque client cookie passed to callbacks */
275 	protocol_family_t               filt_protocol;
276 	iff_input_func                  filt_input;	/* inbound packet hook */
277 	iff_output_func                 filt_output;	/* outbound packet hook */
278 	iff_event_func                  filt_event;	/* interface event hook */
279 	iff_ioctl_func                  filt_ioctl;	/* ioctl hook */
280 	iff_detached_func               filt_detached;	/* detach notification */
281 };
282 
283 struct proto_input_entry;
284 
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286 
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288 
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294 
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297     &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299     &dlil_lck_attributes);
300 
301 #if DEBUG
302 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug;        /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
308 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
310 
311 static ZONE_DEFINE(dlif_filt_zone, "ifnet_filter",
312     sizeof(struct ifnet_filter), ZC_ZFREE_CLEARMEM);
313 
314 static ZONE_DEFINE(dlif_phash_zone, "ifnet_proto_hash",
315     sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, ZC_ZFREE_CLEARMEM);
316 
317 static ZONE_DEFINE(dlif_proto_zone, "ifnet_proto",
318     sizeof(struct if_proto), ZC_ZFREE_CLEARMEM);
319 
320 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
321 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
322 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
323 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
324 
325 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
326 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
327 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
328 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
329 
330 static u_int32_t net_rtref;
331 
332 static struct dlil_main_threading_info dlil_main_input_thread_info;
333 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
334     (struct dlil_threading_info *)&dlil_main_input_thread_info;
335 
336 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
337 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
338 static void dlil_if_trace(struct dlil_ifnet *, int);
339 static void if_proto_ref(struct if_proto *);
340 static void if_proto_free(struct if_proto *);
341 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
342 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
343     u_int32_t list_count);
344 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
345 static void if_flt_monitor_busy(struct ifnet *);
346 static void if_flt_monitor_unbusy(struct ifnet *);
347 static void if_flt_monitor_enter(struct ifnet *);
348 static void if_flt_monitor_leave(struct ifnet *);
349 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
350     char **, protocol_family_t);
351 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
352     protocol_family_t);
353 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
354     const struct sockaddr_dl *);
355 static int ifnet_lookup(struct ifnet *);
356 static void if_purgeaddrs(struct ifnet *);
357 
358 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
359     struct mbuf *, char *);
360 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
361     struct mbuf *);
362 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
363     mbuf_t *, const struct sockaddr *, void *, char *, char *);
364 static void ifproto_media_event(struct ifnet *, protocol_family_t,
365     const struct kev_msg *);
366 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
367     unsigned long, void *);
368 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
369     struct sockaddr_dl *, size_t);
370 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
371     const struct sockaddr_dl *, const struct sockaddr *,
372     const struct sockaddr_dl *, const struct sockaddr *);
373 
374 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
375     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
376     boolean_t poll, struct thread *tp);
377 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
378     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
379 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
380 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
381     protocol_family_t *);
382 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
383     const struct ifnet_demux_desc *, u_int32_t);
384 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
385 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
386 #if !XNU_TARGET_OS_OSX
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388     const struct sockaddr *, const char *, const char *,
389     u_int32_t *, u_int32_t *);
390 #else /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
392     const struct sockaddr *, const char *, const char *);
393 #endif /* XNU_TARGET_OS_OSX */
394 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
395     const struct sockaddr *, const char *, const char *,
396     u_int32_t *, u_int32_t *);
397 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
398 static void ifp_if_free(struct ifnet *);
399 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
400 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
401 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
402 
403 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
404     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405     boolean_t, struct thread *);
406 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
407     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
408     boolean_t, struct thread *);
409 
410 static void dlil_main_input_thread_func(void *, wait_result_t);
411 static void dlil_main_input_thread_cont(void *, wait_result_t);
412 
413 static void dlil_input_thread_func(void *, wait_result_t);
414 static void dlil_input_thread_cont(void *, wait_result_t);
415 
416 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
417 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
418 
419 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
420     thread_continue_t *);
421 static void dlil_terminate_input_thread(struct dlil_threading_info *);
422 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
423     struct dlil_threading_info *, struct ifnet *, boolean_t);
424 static boolean_t dlil_input_stats_sync(struct ifnet *,
425     struct dlil_threading_info *);
426 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
427     u_int32_t, ifnet_model_t, boolean_t);
428 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
429     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
430 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
431 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
432 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
433 #if DEBUG || DEVELOPMENT
434 static void dlil_verify_sum16(void);
435 #endif /* DEBUG || DEVELOPMENT */
436 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
437     protocol_family_t);
438 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
439     protocol_family_t);
440 
441 static void dlil_incr_pending_thread_count(void);
442 static void dlil_decr_pending_thread_count(void);
443 
444 static void ifnet_detacher_thread_func(void *, wait_result_t);
445 static void ifnet_detacher_thread_cont(void *, wait_result_t);
446 static void ifnet_detach_final(struct ifnet *);
447 static void ifnet_detaching_enqueue(struct ifnet *);
448 static struct ifnet *ifnet_detaching_dequeue(void);
449 
450 static void ifnet_start_thread_func(void *, wait_result_t);
451 static void ifnet_start_thread_cont(void *, wait_result_t);
452 
453 static void ifnet_poll_thread_func(void *, wait_result_t);
454 static void ifnet_poll_thread_cont(void *, wait_result_t);
455 
456 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
457     classq_pkt_t *, boolean_t, boolean_t *);
458 
459 static void ifp_src_route_copyout(struct ifnet *, struct route *);
460 static void ifp_src_route_copyin(struct ifnet *, struct route *);
461 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
462 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
463 
464 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
465 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
470 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
471 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
473 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
474 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
475 
476 struct chain_len_stats tx_chain_len_stats;
477 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
478 
479 #if TEST_INPUT_THREAD_TERMINATION
480 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
481 #endif /* TEST_INPUT_THREAD_TERMINATION */
482 
483 
484 /* The following are protected by dlil_ifnet_lock */
485 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
486 static u_int32_t ifnet_detaching_cnt;
487 static boolean_t ifnet_detaching_embryonic;
488 static void *ifnet_delayed_run; /* wait channel for detaching thread */
489 
490 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
491     &dlil_lck_attributes);
492 
493 static uint32_t ifnet_flowhash_seed;
494 
/*
 * Input block for ifnet_calc_flowhash(): identifying fields of an ifnet
 * plus two random words, hashed to produce the per-interface flow hash
 * (presumably mixed with ifnet_flowhash_seed — confirm in the function).
 */
495 struct ifnet_flowhash_key {
496 	char            ifk_name[IFNAMSIZ];	/* interface name */
497 	uint32_t        ifk_unit;		/* interface unit number */
498 	uint32_t        ifk_flags;
499 	uint32_t        ifk_eflags;
500 	uint32_t        ifk_capabilities;
501 	uint32_t        ifk_capenable;
502 	uint32_t        ifk_output_sched_model;
503 	uint32_t        ifk_rand1;		/* random salt */
504 	uint32_t        ifk_rand2;		/* random salt */
505 };
506 
507 /* Flow control entry per interface */
/* Flow control entry per interface */
508 struct ifnet_fc_entry {
509 	RB_ENTRY(ifnet_fc_entry) ifce_entry;	/* ifnet_fc_tree node (ifnet_fc_lock) */
510 	u_int32_t       ifce_flowhash;		/* lookup key; see ifnet_fc_get()/ifnet_calc_flowhash() */
511 	struct ifnet    *ifce_ifp;		/* interface this entry belongs to */
512 };
513 
514 static uint32_t ifnet_calc_flowhash(struct ifnet *);
515 static int ifce_cmp(const struct ifnet_fc_entry *,
516     const struct ifnet_fc_entry *);
517 static int ifnet_fc_add(struct ifnet *);
518 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
519 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
520 
521 /* protected by ifnet_fc_lock */
522 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
523 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
524 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
525 
526 static ZONE_DEFINE(ifnet_fc_zone, "ifnet_fc_zone",
527     sizeof(struct ifnet_fc_entry), ZC_ZFREE_CLEARMEM);
528 
529 extern void bpfdetach(struct ifnet *);
530 extern void proto_input_run(void);
531 
532 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
533     u_int32_t flags);
534 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
535     u_int32_t flags);
536 
537 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
538 
539 #if CONFIG_MACF
540 #if !XNU_TARGET_OS_OSX
541 int dlil_lladdr_ckreq = 1;
542 #else /* XNU_TARGET_OS_OSX */
543 int dlil_lladdr_ckreq = 0;
544 #endif /* XNU_TARGET_OS_OSX */
545 #endif /* CONFIG_MACF */
546 
547 #if DEBUG
548 int dlil_verbose = 1;
549 #else
550 int dlil_verbose = 0;
551 #endif /* DEBUG */
552 #if IFNET_INPUT_SANITY_CHK
553 /* sanity checking of input packet lists received */
554 static u_int32_t dlil_input_sanity_check = 0;
555 #endif /* IFNET_INPUT_SANITY_CHK */
556 /* rate limit debug messages */
557 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
558 
559 SYSCTL_DECL(_net_link_generic_system);
560 
561 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
562     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
563 
564 #define IF_SNDQ_MINLEN  32
565 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
566 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
567     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
568     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
569 
570 #define IF_RCVQ_MINLEN  32
571 #define IF_RCVQ_MAXLEN  256
572 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
573 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
574     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
575     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
576 
577 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
578 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
579 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
580     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
581     "ilog2 of EWMA decay rate of avg inbound packets");
582 
583 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
584 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
585 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
586 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
587     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
588     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
589     "Q", "input poll mode freeze time");
590 
591 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
592 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
593 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
595     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
596     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
597     "Q", "input poll sampling time");
598 
599 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
600 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
601     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
602     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
603     "Q", "input poll interval (time)");
604 
605 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
606 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
607 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
608     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
609     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
610 
611 #define IF_RXPOLL_WLOWAT        10
612 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
613 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
614     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
615     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
616     "I", "input poll wakeup low watermark");
617 
618 #define IF_RXPOLL_WHIWAT        100
619 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
620 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
621     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
622     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
623     "I", "input poll wakeup high watermark");
624 
625 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
626 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
627     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
628     "max packets per poll call");
629 
630 u_int32_t if_rxpoll = 1;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
632     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
633     sysctl_rxpoll, "I", "enable opportunistic input polling");
634 
635 #if TEST_INPUT_THREAD_TERMINATION
636 static u_int32_t if_input_thread_termination_spin = 0;
637 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
638     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
639     &if_input_thread_termination_spin, 0,
640     sysctl_input_thread_termination_spin,
641     "I", "input thread termination spin limit");
642 #endif /* TEST_INPUT_THREAD_TERMINATION */
643 
644 static u_int32_t cur_dlil_input_threads = 0;
645 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
646     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
647     "Current number of DLIL input threads");
648 
649 #if IFNET_INPUT_SANITY_CHK
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
651     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
652     "Turn on sanity checking in DLIL input");
653 #endif /* IFNET_INPUT_SANITY_CHK */
654 
655 static u_int32_t if_flowadv = 1;
656 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
657     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
658     "enable flow-advisory mechanism");
659 
660 static u_int32_t if_delaybased_queue = 1;
661 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
662     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
663     "enable delay based dynamic queue sizing");
664 
665 static uint64_t hwcksum_in_invalidated = 0;
666 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
667     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
668     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
669 
670 uint32_t hwcksum_dbg = 0;
671 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
672     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
673     "enable hardware cksum debugging");
674 
675 u_int32_t ifnet_start_delayed = 0;
676 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
677     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
678     "number of times start was delayed");
679 
680 u_int32_t ifnet_delay_start_disabled = 0;
681 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
682     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
683     "number of times start was delayed");
684 
685 #if DEVELOPMENT || DEBUG
686 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
687 
688 struct flow_key flow_key_trace;
689 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
690     CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
691 #endif /* DEVELOPMENT || DEBUG */
692 
693 static inline void
ifnet_delay_start_disabled_increment(void)694 ifnet_delay_start_disabled_increment(void)
695 {
696 	OSIncrementAtomic(&ifnet_delay_start_disabled);
697 }
698 
/* bit values accepted in hwcksum_dbg_mode (validated by its sysctl handler) */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* current hardware-checksum debug mode; set via sysctl_hwcksum_dbg_mode */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* read-only counter: packets forced through the partial-checksum path */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

/* read-only counter: bytes forced through the partial-checksum path */
static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

/* forced RX start offset for partial checksum; validated by its handler */
static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

/* adjusted RX start offset for partial checksum; validated by its handler */
static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");
733 
/* read-only hardware-checksum debug counters exported below */
static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");
763 
/* global enable for transmit hardware checksum offload (default on) */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

/* global enable for receive hardware checksum offload (default on) */
uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* arg2 == 9 is passed through to the handler; meaning defined there */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");
789 
#if (DEVELOPMENT || DEBUG)
/* debug-only node exposing keep-alive offload frames */
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* aggregate networking API usage statistics, exported read-only */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* nonzero enables wake-packet debug logging */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

/* global feature toggles; read at interface/thread setup time */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
816 
817 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)818 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
819 {
820 	/*
821 	 * update filter count and route_generation ID to let TCP
822 	 * know it should reevalute doing TSO or not
823 	 */
824 	if (filter_enable) {
825 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
826 	} else {
827 		VERIFY(ifp->if_flt_no_tso_count != 0);
828 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
829 	}
830 	routegenid_update();
831 }
832 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
/* forward declaration; macOS-only filter compatibility check */
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* booleans derived from the compile-time default attach flags */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
851 
#if (DEVELOPMENT || DEBUG)
/*
 * Sysctl handler for net.link.generic.system.if_attach_nx.  Lets the
 * nexus auto-attach flags be tuned at runtime, except the flowswitch
 * transport-netagent bit, which cannot be toggled via this knob.
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int    proposed;
	int             changed;
	int             error;

	error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &proposed, &changed);
	if (error != 0) {
		return error;
	}
	if (!changed) {
		return 0;
	}
	/* refuse any change to the transport netagent bit */
	if (((proposed ^ if_attach_nx) &
	    IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
		return ENOTSUP;
	}
	if_attach_nx = proposed;
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
879 
880 static int
881 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
882 {
883 #pragma unused(oidp, arg1, arg2)
884 	unsigned int new_value;
885 	int changed;
886 	int error;
887 
888 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
889 	    sizeof(if_enable_fsw_transport_netagent),
890 	    &new_value, &changed);
891 	if (error == 0 && changed != 0) {
892 		if (new_value != 0 && new_value != 1) {
893 			/* only allow 0 or 1 */
894 			error = EINVAL;
895 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
896 			/* netagent can be enabled/disabled */
897 			if_enable_fsw_transport_netagent = new_value;
898 			if (new_value == 0) {
899 				kern_nexus_deregister_netagents();
900 			} else {
901 				kern_nexus_register_netagents();
902 			}
903 		} else {
904 			/* netagent can't be enabled */
905 			error = ENOTSUP;
906 		}
907 	}
908 	return error;
909 }
910 
911 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
912     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
913     0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
914     "enable flowswitch netagent");
915 
/* forward declaration; definition follows later in this section */
static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
919 
920 boolean_t
ifnet_nx_noauto(ifnet_t ifp)921 ifnet_nx_noauto(ifnet_t ifp)
922 {
923 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
924 }
925 
926 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)927 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
928 {
929 	return ifnet_is_low_latency(ifp);
930 }
931 
932 boolean_t
ifnet_is_low_latency(ifnet_t ifp)933 ifnet_is_low_latency(ifnet_t ifp)
934 {
935 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
936 }
937 
/*
 * Decide whether the netif compat layer should be plumbed for ifp.
 * Gated first on the global IF_ATTACH_NX_NETIF_COMPAT flag; on embedded
 * targets the "ap" (Wi-Fi Access Point) interface additionally honors
 * the if_netif_all override.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			/* "ap" interface: compat only if NETIF_ALL is set */
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
962 
963 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)964 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
965 {
966 	if (if_is_fsw_transport_netagent_enabled()) {
967 		/* check if netagent has been manually enabled for ipsec/utun */
968 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
969 			return ipsec_interface_needs_netagent(ifp);
970 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
971 			return utun_interface_needs_netagent(ifp);
972 		}
973 
974 		/* check ifnet no auto nexus override */
975 		if (ifnet_nx_noauto(ifp)) {
976 			return FALSE;
977 		}
978 
979 		/* check global if_attach_nx configuration */
980 		switch (ifp->if_family) {
981 		case IFNET_FAMILY_CELLULAR:
982 		case IFNET_FAMILY_ETHERNET:
983 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
984 				return TRUE;
985 			}
986 			break;
987 		default:
988 			break;
989 		}
990 	}
991 	return FALSE;
992 }
993 
994 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)995 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
996 {
997 #pragma unused(ifp)
998 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
999 		return TRUE;
1000 	}
1001 	return FALSE;
1002 }
1003 
1004 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1005 ifnet_needs_netif_netagent(ifnet_t ifp)
1006 {
1007 #pragma unused(ifp)
1008 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1009 }
1010 
1011 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1012 dlil_detach_nexus_instance(nexus_controller_t controller,
1013     const char *func_str, uuid_t instance, uuid_t device)
1014 {
1015 	errno_t         err;
1016 
1017 	if (instance == NULL || uuid_is_null(instance)) {
1018 		return FALSE;
1019 	}
1020 
1021 	/* followed by the device port */
1022 	if (device != NULL && !uuid_is_null(device)) {
1023 		err = kern_nexus_ifdetach(controller, instance, device);
1024 		if (err != 0) {
1025 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1026 			    func_str, err);
1027 		}
1028 	}
1029 	err = kern_nexus_controller_free_provider_instance(controller,
1030 	    instance);
1031 	if (err != 0) {
1032 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
1033 		    func_str, err);
1034 	}
1035 	return TRUE;
1036 }
1037 
1038 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1039 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1040     uuid_t device)
1041 {
1042 	boolean_t               detached = FALSE;
1043 	nexus_controller_t      controller = kern_nexus_shared_controller();
1044 	int                     err;
1045 
1046 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1047 	    device)) {
1048 		detached = TRUE;
1049 	}
1050 	if (provider != NULL && !uuid_is_null(provider)) {
1051 		detached = TRUE;
1052 		err = kern_nexus_controller_deregister_provider(controller,
1053 		    provider);
1054 		if (err != 0) {
1055 			DLIL_PRINTF("%s deregister_provider %d\n",
1056 			    func_str, err);
1057 		}
1058 	}
1059 	return detached;
1060 }
1061 
/*
 * Register a nexus provider (netif or flowswitch) for ifp and allocate
 * one provider instance.  On success *provider and *instance hold the
 * new UUIDs; on failure any provider registered here is deregistered.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	/* e.g. "com.apple.netif.en0" */
	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration made just above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	/* success path also falls through here with err == 0 */
	return err;
}
1111 
/*
 * Create and attach a netif nexus (provider + instance) for ifp.
 * Returns TRUE on success; on any failure intermediate state is torn
 * down and FALSE is returned.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	/* IFCAP_SKYWALK set means a nexus is already plumbed */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	/* attr is owned locally; free it on every exit path */
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1165 
1166 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1167 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1168 {
1169 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1170 	    IFNET_IS_VMNET(ifp)) {
1171 		goto failed;
1172 	}
1173 	switch (ifp->if_type) {
1174 	case IFT_CELLULAR:
1175 	case IFT_ETHER:
1176 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1177 			/* don't auto-attach */
1178 			goto failed;
1179 		}
1180 		break;
1181 	default:
1182 		/* don't auto-attach */
1183 		goto failed;
1184 	}
1185 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1186 
1187 failed:
1188 	return FALSE;
1189 }
1190 
1191 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1192 dlil_is_native_netif_nexus(ifnet_t ifp)
1193 {
1194 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1195 }
1196 
/* Tear down the netif nexus described by nexus_netif (wrapper). */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1204 
1205 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1206 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1207 {
1208 	struct ifreq        ifr;
1209 	int                 error;
1210 
1211 	bzero(&ifr, sizeof(ifr));
1212 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1213 	if (error == 0) {
1214 		*ifdm_p = ifr.ifr_devmtu;
1215 	}
1216 	return error;
1217 }
1218 
/*
 * Compute the flowswitch buffer geometry for ifp: *buf_size (per-slot
 * buffer), *large_buf_size (large buffer, 0 if unused) and
 * *use_multi_buflet (multi-buflet packets).  The result is derived from
 * the interface's max MTU, the driver's packet pool (native drivers),
 * TSO segment sizes (macOS native drivers) and RX aggregation config.
 * Returns 0 on success or an errno on failure.
 */
static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
    bool *use_multi_buflet, uint32_t *large_buf_size)
{
	struct kern_pbufpool_memory_info rx_pp_info;
	struct kern_pbufpool_memory_info tx_pp_info;
	uint32_t if_max_mtu = 0;
	uint32_t drv_buf_size;
	struct ifdevmtu ifdm;
	int err;

	/*
	 * To perform intra-stack RX aggregation flowswitch needs to use
	 * multi-buflet packet.
	 */
	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();

	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
	/*
	 * IP over Thunderbolt interface can deliver the largest IP packet,
	 * but the driver advertises the MAX MTU as only 9K.
	 */
	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
		if_max_mtu = IP_MAXPACKET;
		goto skip_mtu_ioctl;
	}

	/* determine max mtu */
	bzero(&ifdm, sizeof(ifdm));
	err = dlil_siocgifdevmtu(ifp, &ifdm);
	if (__improbable(err != 0)) {
		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
		    __func__, if_name(ifp));
		/* use default flowswitch buffer size */
		if_max_mtu = NX_FSW_BUFSIZE;
	} else {
		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
		    ifdm.ifdm_max, ifdm.ifdm_current);
		/* rdar://problem/44589731 */
		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
	}

skip_mtu_ioctl:
	if (if_max_mtu == 0) {
		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
		    __func__, if_name(ifp));
		return EINVAL;
	}
	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
		    "max bufsize(%d)\n", __func__,
		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
		return EINVAL;
	}

	/*
	 * for skywalk native driver, consult the driver packet pool also.
	 */
	if (dlil_is_native_netif_nexus(ifp)) {
		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
		    &tx_pp_info);
		if (err != 0) {
			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
			    __func__, if_name(ifp));
			return ENXIO;
		}
		/* largest packet the driver TX pool can hold */
		drv_buf_size = tx_pp_info.kpm_bufsize *
		    tx_pp_info.kpm_max_frags;
		if (if_max_mtu > drv_buf_size) {
			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
			    if_name(ifp), rx_pp_info.kpm_bufsize,
			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
			    tx_pp_info.kpm_max_frags, if_max_mtu);
			return EINVAL;
		}
	} else {
		drv_buf_size = if_max_mtu;
	}

	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
		/* multi-buflet must still cover a full IP packet */
		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
		*use_multi_buflet = true;
		/* default flowswitch buffer size */
		*buf_size = NX_FSW_BUFSIZE;
		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
	} else {
		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
	}

	/*
	 * if HW TSO is enabled on a Skywalk native interface then make
	 * the flowswitch default buffer be able to handle max TSO segment.
	 */
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;
#ifdef XNU_TARGET_OS_OSX
	if (dlil_is_native_netif_nexus(ifp)) {
		if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
			tso_v4_mtu = ifp->if_tso_v4_mtu;
		}
		if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
			tso_v6_mtu = ifp->if_tso_v6_mtu;
		}
	}
#endif /* XNU_TARGET_OS_OSX */
	if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
		*buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
		ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
	}
	/* a large buffer no bigger than the regular one is pointless */
	if (*buf_size >= *large_buf_size) {
		*large_buf_size = 0;
	}
	return 0;
}
1334 
/*
 * Create and attach a flowswitch nexus on top of ifp's netif nexus.
 * Subject to per-interface opt-outs, the presence of a netif
 * (IFCAP_SKYWALK) and the global IF_ATTACH_NX_FLOWSWITCH flag.
 * On success fills in nexus_fsw and returns TRUE; on any failure all
 * intermediate state is cleaned up and FALSE is returned.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means a deliberate "don't attach", not an error */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1433 
/*
 * Attach a flowswitch nexus to ifp and publish it in if_nx_flowswitch
 * under the ifnet lock.  If the interface went away while attaching,
 * the freshly-created nexus is torn down again.  Returns TRUE only if
 * the nexus ended up attached and published.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	/* non-null instance UUID means a flowswitch is already in place */
	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1479 
/* Tear down the flowswitch nexus described by nexus_fsw (wrapper). */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1487 
1488 __attribute__((noinline))
1489 static void
dlil_netif_detach_notify(ifnet_t ifp)1490 dlil_netif_detach_notify(ifnet_t ifp)
1491 {
1492 	void (*detach_notify)(struct nexus_netif_adapter *);
1493 
1494 	/*
1495 	 * This is only needed for low latency interfaces for now.
1496 	 */
1497 	if (!ifnet_is_low_latency(ifp)) {
1498 		return;
1499 	}
1500 	detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1501 	if (detach_notify != NULL) {
1502 		(*detach_notify)(ifp->if_na);
1503 	} else {
1504 		DLIL_PRINTF("%s: %s has no detach notify calback\n",
1505 		    __func__, if_name(ifp));
1506 	}
1507 }
1508 
/*
 * Quiesce data movement on ifp, then detach and clear its flowswitch
 * and netif nexuses (flowswitch first, since it sits on top of netif).
 * The ASSERTs enforce that each nexus's UUIDs are all-set or all-null.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block new data movement and drain in-flight activity */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	/* re-enable data movement */
	ifnet_datamov_resume(ifp);
}
1540 
1541 boolean_t
ifnet_add_netagent(ifnet_t ifp)1542 ifnet_add_netagent(ifnet_t ifp)
1543 {
1544 	int     error;
1545 
1546 	error = kern_nexus_interface_add_netagent(ifp);
1547 	os_log(OS_LOG_DEFAULT,
1548 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1549 	    ifp->if_xname, error);
1550 	return error == 0;
1551 }
1552 
1553 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1554 ifnet_remove_netagent(ifnet_t ifp)
1555 {
1556 	int     error;
1557 
1558 	error = kern_nexus_interface_remove_netagent(ifp);
1559 	os_log(OS_LOG_DEFAULT,
1560 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1561 	    ifp->if_xname, error);
1562 	return error == 0;
1563 }
1564 
1565 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1566 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1567 {
1568 	if (!IF_FULLY_ATTACHED(ifp)) {
1569 		return FALSE;
1570 	}
1571 	return dlil_attach_flowswitch_nexus(ifp);
1572 }
1573 
/*
 * Detach ifp's flowswitch nexus: snapshot and clear the state under the
 * ifnet lock, then perform the actual detach outside the lock.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch     nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1586 
/*
 * Attach a netif nexus to a fully-attached ifnet and publish it in
 * if_nx_netif under the ifnet lock.  Returns TRUE on success.
 */
boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)
{
	boolean_t       nexus_attached;
	if_nexus_netif  nexus_netif;

	if (!IF_FULLY_ATTACHED(ifp)) {
		return FALSE;
	}
	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
	if (nexus_attached) {
		ifnet_lock_exclusive(ifp);
		ifp->if_nx_netif = nexus_netif;
		ifnet_lock_done(ifp);
	}
	return nexus_attached;
}
1604 
/*
 * Detach ifp's netif nexus: snapshot and clear the state under the
 * ifnet lock, then perform the actual detach outside the lock.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif  nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1618 
1619 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1620 ifnet_attach_native_flowswitch(ifnet_t ifp)
1621 {
1622 	if (!dlil_is_native_netif_nexus(ifp)) {
1623 		/* not a native netif */
1624 		return;
1625 	}
1626 	ifnet_attach_flowswitch_nexus(ifp);
1627 }
1628 
#endif /* SKYWALK */

/*
 * Sanity-check an inbound mbuf: it must carry a packet header and its
 * recorded receive interface must match ifp (loopback excepted);
 * otherwise panic.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}

/*
 * Integer exponentially-weighted moving average with decay factor
 * 2^(decay); seeds directly with "new" when the average is still zero.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* link-speed units, in bits per second */
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)
1651 
/* per-link-speed RX polling watermarks */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* ordered by increasing speed; the all-zero entry terminates the table */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* protects dlil_pending_thread_cnt and its wakeup below */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1672 
/*
 * Bump the count of DLIL threads that have been started but have not
 * yet finished initializing.  dlil_decr_pending_thread_count() issues a
 * wakeup on the counter when it drains to zero, so a waiter can sleep
 * on &dlil_pending_thread_cnt.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1681 
/*
 * Drop the pending DLIL thread count; wake any waiter sleeping on the
 * counter once the last pending thread has checked in.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1694 
1695 int
proto_hash_value(u_int32_t protocol_family)1696 proto_hash_value(u_int32_t protocol_family)
1697 {
1698 	/*
1699 	 * dlil_proto_unplumb_all() depends on the mapping between
1700 	 * the hash bucket index and the protocol family defined
1701 	 * here; future changes must be applied there as well.
1702 	 */
1703 	switch (protocol_family) {
1704 	case PF_INET:
1705 		return 0;
1706 	case PF_INET6:
1707 		return 1;
1708 	case PF_VLAN:
1709 		return 2;
1710 	case PF_UNSPEC:
1711 	default:
1712 		return 3;
1713 	}
1714 }
1715 
1716 /*
1717  * Caller must already be holding ifnet lock.
1718  */
1719 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1720 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1721 {
1722 	struct if_proto *proto = NULL;
1723 	u_int32_t i = proto_hash_value(protocol_family);
1724 
1725 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1726 
1727 	if (ifp->if_proto_hash != NULL) {
1728 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1729 	}
1730 
1731 	while (proto != NULL && proto->protocol_family != protocol_family) {
1732 		proto = SLIST_NEXT(proto, next_hash);
1733 	}
1734 
1735 	if (proto != NULL) {
1736 		if_proto_ref(proto);
1737 	}
1738 
1739 	return proto;
1740 }
1741 
/*
 * Take a reference on an if_proto; paired with if_proto_free().
 */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1747 
1748 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1749 
/*
 * Release a reference on an if_proto.  When the last reference is
 * dropped: invoke the protocol's detached callback (v1 or v2 KPI),
 * purge routes for the interface/protocol pair, post a
 * KEV_DL_PROTO_DETACHED event, mark the interface down if no protocols
 * remain attached, and free the structure.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	/* drop our reference; only the final release proceeds past here */
	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		return;
	}

	/* notify the protocol that it has been detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1811 
1812 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1813 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1814 {
1815 #if !MACH_ASSERT
1816 #pragma unused(ifp)
1817 #endif
1818 	unsigned int type = 0;
1819 	int ass = 1;
1820 
1821 	switch (what) {
1822 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1823 		type = LCK_RW_ASSERT_EXCLUSIVE;
1824 		break;
1825 
1826 	case IFNET_LCK_ASSERT_SHARED:
1827 		type = LCK_RW_ASSERT_SHARED;
1828 		break;
1829 
1830 	case IFNET_LCK_ASSERT_OWNED:
1831 		type = LCK_RW_ASSERT_HELD;
1832 		break;
1833 
1834 	case IFNET_LCK_ASSERT_NOTOWNED:
1835 		/* nothing to do here for RW lock; bypass assert */
1836 		ass = 0;
1837 		break;
1838 
1839 	default:
1840 		panic("bad ifnet assert type: %d", what);
1841 		/* NOTREACHED */
1842 	}
1843 	if (ass) {
1844 		LCK_RW_ASSERT(&ifp->if_lock, type);
1845 	}
1846 }
1847 
/* Acquire the per-ifnet RW lock in shared (read) mode. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the per-ifnet RW lock in exclusive (write) mode. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-ifnet RW lock (held shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1865 
#if INET
/* Acquire the per-ifnet IPv4 data RW lock in shared mode. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-ifnet IPv4 data RW lock in exclusive mode. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-ifnet IPv4 data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
1885 
/* Acquire the per-ifnet IPv6 data RW lock in shared mode. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-ifnet IPv6 data RW lock in exclusive mode. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-ifnet IPv6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1903 
/* Acquire the global interface-list RW lock in shared mode. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock in exclusive mode. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1927 
1928 /*
1929  * dlil_ifp_protolist
1930  * - get the list of protocols attached to the interface, or just the number
1931  *   of attached protocols
1932  * - if the number returned is greater than 'list_count', truncation occurred
1933  *
1934  * Note:
1935  * - caller must already be holding ifnet lock.
1936  */
1937 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1938 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1939     u_int32_t list_count)
1940 {
1941 	u_int32_t       count = 0;
1942 	int             i;
1943 
1944 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1945 
1946 	if (ifp->if_proto_hash == NULL) {
1947 		goto done;
1948 	}
1949 
1950 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1951 		struct if_proto *proto;
1952 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1953 			if (list != NULL && count < list_count) {
1954 				list[count] = proto->protocol_family;
1955 			}
1956 			count++;
1957 		}
1958 	}
1959 done:
1960 	return count;
1961 }
1962 
1963 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)1964 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
1965 {
1966 	ifnet_lock_shared(ifp);
1967 	count = dlil_ifp_protolist(ifp, protolist, count);
1968 	ifnet_lock_done(ifp);
1969 	return count;
1970 }
1971 
/*
 * Release a protocol list buffer handed out to if_get_protolist()
 * callers.
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1977 
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for an interface.
 * The payload always begins with a net_event_data stamped with the
 * interface's name, family and unit; the caller may supply a larger
 * event structure, or NULL to get a minimal one.  Certain frequent
 * KEV_DL events — and any call with suppress_generation set — skip the
 * interface-generation update performed by dlil_event_internal().
 * Returns the result of dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	/* fall back to the minimal on-stack payload when none was given */
	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2039 
/*
 * Allocate the per-interface protocol statistics: 64-bit-aligned
 * TCP/UDP local stats carved out of zone buffers, plus the IPv4/IPv6
 * ECN stats.  Returns 0 on success, EINVAL when ifp is NULL or when the
 * TCP/UDP stats were not both unallocated on entry.
 *
 * NOTE(review): if this were ever called while if_tcp_stat/if_udp_stat
 * are already set, ret stays EINVAL and the error path below frees the
 * existing structures — callers appear expected to invoke this only
 * once per interface; confirm before reuse.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* on failure, unwind anything currently attached to the ifnet */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2125 
/*
 * Restore an interface's opportunistic-polling state to defaults:
 * cancel the poll cycle, force the input model back to POLL_OFF, and
 * wipe all polling statistics and timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2144 
/*
 * Initialize the threading state for an interface's input path (or for
 * the main input thread when ifp is NULL) and start the thread.  The
 * continuation is chosen from the interface's capabilities:
 *   - ifp == NULL:                        dlil_main_input_thread_func
 *   - legacy rxpoll-capable interface:    dlil_rxpoll_input_thread_func
 *   - net_async or legacy interface:      dlil_input_thread_func
 *   - otherwise (netif below, no hybrid polling): synchronous strategy,
 *     no dedicated thread; returns ENODEV after setting up the queue.
 * The selected continuation (NULL for the synchronous case) is reported
 * back through *thfunc when non-NULL.  Failure to start a thread for a
 * strategy that requires one is fatal (panic).
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy (non-netif) hybrid polling requires all of these */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* non-polling interfaces get an effectively unbounded queue */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* the main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no thread to start */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2279 
#if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for the debug spin count applied when an input thread
 * terminates (TEST_INPUT_THREAD_TERMINATION builds only).  Setting a
 * value requires rx polling to be enabled (ENXIO otherwise).
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* read-only access or error: nothing more to do */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2303 
/*
 * Tear down a dlil_threading_info that is no longer in use: destroy its
 * mutex and lock group and reset every field, verifying along the way
 * that no queued packets, affinity settings or auxiliary threads
 * remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* the packet queue must already have been drained */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2329 
/*
 * Final act of an interface input thread, executed on the thread itself
 * after DLIL_INPUT_TERMINATE has been requested: drain and free any
 * queued packets, signal termination completion to the waiter, drop the
 * thread reference taken by kernel_thread_start(), and terminate.
 * Never returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/* detach the queued packets and flag completion under the lock */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2377 
2378 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2379 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2380 {
2381 	thread_affinity_policy_data_t policy;
2382 
2383 	bzero(&policy, sizeof(policy));
2384 	policy.affinity_tag = tag;
2385 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2386 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2387 }
2388 
2389 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2390 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2391 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2392     enum net_filter_event_subsystems state)
2393 {
2394 	if (state == 0) {
2395 		if_enable_fsw_transport_netagent = 1;
2396 	} else {
2397 		if_enable_fsw_transport_netagent = 0;
2398 	}
2399 	kern_nexus_update_netagents();
2400 }
2401 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2402 
2403 void
dlil_init(void)2404 dlil_init(void)
2405 {
2406 	thread_t thread = THREAD_NULL;
2407 
2408 	/*
2409 	 * The following fields must be 64-bit aligned for atomic operations.
2410 	 */
2411 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2412 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2413 	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2414 	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2415 	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2416 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2417 	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2418 	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2419 	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2420 	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2421 	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2422 	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2423 	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2424 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2425 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2426 
2427 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2428 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2429 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2430 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2431 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2432 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2433 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2434 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2435 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2436 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2437 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2438 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2439 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2440 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2441 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2442 
2443 	/*
2444 	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2445 	 */
2446 	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2447 	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2448 	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2449 	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2450 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2451 	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2452 	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2453 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2454 	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2455 	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2456 	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2457 	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2458 	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2459 	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2460 
2461 	/*
2462 	 * ... as well as the mbuf checksum flags counterparts.
2463 	 */
2464 	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2465 	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2466 	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2467 	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2468 	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2469 	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2470 	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2471 	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2472 	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2473 	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2474 	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2475 
2476 	/*
2477 	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2478 	 */
2479 	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2480 	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2481 
2482 	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2483 	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2484 	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2485 	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2486 
2487 	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2488 	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2489 	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2490 
2491 	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2492 	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2493 	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2494 	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2495 	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2496 	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2497 	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2498 	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2499 	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2500 	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2501 	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2502 	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2503 	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2504 	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2505 	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2506 	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2507 	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2508 	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2509 
2510 	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2511 	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2512 	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2513 	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2514 	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2515 	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2516 	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2517 	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2518 	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);
2519 
2520 	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2521 	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2522 
2523 	PE_parse_boot_argn("net_affinity", &net_affinity,
2524 	    sizeof(net_affinity));
2525 
2526 	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2527 
2528 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2529 
2530 	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2531 
2532 	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2533 
2534 	VERIFY(dlil_pending_thread_cnt == 0);
2535 #if SKYWALK
2536 	boolean_t pe_enable_fsw_transport_netagent = FALSE;
2537 	boolean_t pe_disable_fsw_transport_netagent = FALSE;
2538 	boolean_t enable_fsw_netagent =
2539 	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2540 	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2541 
2542 	/*
2543 	 * Check the device tree to see if Skywalk netagent has been explicitly
2544 	 * enabled or disabled.  This can be overridden via if_attach_nx below.
2545 	 * Note that the property is a 0-length key, and so checking for the
2546 	 * presence itself is enough (no need to check for the actual value of
2547 	 * the retrieved variable.)
2548 	 */
2549 	pe_enable_fsw_transport_netagent =
2550 	    PE_get_default("kern.skywalk_netagent_enable",
2551 	    &pe_enable_fsw_transport_netagent,
2552 	    sizeof(pe_enable_fsw_transport_netagent));
2553 	pe_disable_fsw_transport_netagent =
2554 	    PE_get_default("kern.skywalk_netagent_disable",
2555 	    &pe_disable_fsw_transport_netagent,
2556 	    sizeof(pe_disable_fsw_transport_netagent));
2557 
2558 	/*
2559 	 * These two are mutually exclusive, i.e. they both can be absent,
2560 	 * but only one can be present at a time, and so we assert to make
2561 	 * sure it is correct.
2562 	 */
2563 	VERIFY((!pe_enable_fsw_transport_netagent &&
2564 	    !pe_disable_fsw_transport_netagent) ||
2565 	    (pe_enable_fsw_transport_netagent ^
2566 	    pe_disable_fsw_transport_netagent));
2567 
2568 	if (pe_enable_fsw_transport_netagent) {
2569 		kprintf("SK: netagent is enabled via an override for "
2570 		    "this platform\n");
2571 		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2572 	} else if (pe_disable_fsw_transport_netagent) {
2573 		kprintf("SK: netagent is disabled via an override for "
2574 		    "this platform\n");
2575 		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2576 	} else {
2577 		kprintf("SK: netagent is %s by default for this platform\n",
2578 		    (enable_fsw_netagent ? "enabled" : "disabled"));
2579 		if_attach_nx = IF_ATTACH_NX_DEFAULT;
2580 	}
2581 
2582 	/*
2583 	 * Now see if there's a boot-arg override.
2584 	 */
2585 	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2586 	    sizeof(if_attach_nx));
2587 	if_enable_fsw_transport_netagent =
2588 	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2589 
2590 	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2591 
2592 	if (pe_disable_fsw_transport_netagent &&
2593 	    if_enable_fsw_transport_netagent) {
2594 		kprintf("SK: netagent is force-enabled\n");
2595 	} else if (!pe_disable_fsw_transport_netagent &&
2596 	    !if_enable_fsw_transport_netagent) {
2597 		kprintf("SK: netagent is force-disabled\n");
2598 	}
2599 #ifdef XNU_TARGET_OS_OSX
2600 	if (if_enable_fsw_transport_netagent) {
2601 		net_filter_event_register(dlil_filter_event);
2602 	}
2603 #endif /* XNU_TARGET_OS_OSX */
2604 
2605 #if (DEVELOPMENT || DEBUG)
2606 	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2607 	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2608 #endif /* (DEVELOPMENT || DEBUG) */
2609 
2610 #endif /* SKYWALK */
2611 	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2612 	    sizeof(struct dlil_ifnet_dbg);
2613 	/* Enforce 64-bit alignment for dlil_ifnet structure */
2614 	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2615 	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2616 	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2617 
2618 	dlif_tcpstat_size = sizeof(struct tcpstat_local);
2619 	/* Enforce 64-bit alignment for tcpstat_local structure */
2620 	dlif_tcpstat_bufsize =
2621 	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2622 	dlif_tcpstat_bufsize = (uint32_t)
2623 	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2624 	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2625 	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2626 
2627 	dlif_udpstat_size = sizeof(struct udpstat_local);
2628 	/* Enforce 64-bit alignment for udpstat_local structure */
2629 	dlif_udpstat_bufsize =
2630 	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2631 	dlif_udpstat_bufsize = (uint32_t)
2632 	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2633 	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2634 	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2635 
2636 	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2637 
2638 	TAILQ_INIT(&dlil_ifnet_head);
2639 	TAILQ_INIT(&ifnet_head);
2640 	TAILQ_INIT(&ifnet_detaching_head);
2641 	TAILQ_INIT(&ifnet_ordered_head);
2642 
2643 	/* Initialize interface address subsystem */
2644 	ifa_init();
2645 
2646 #if PF
2647 	/* Initialize the packet filter */
2648 	pfinit();
2649 #endif /* PF */
2650 
2651 	/* Initialize queue algorithms */
2652 	classq_init();
2653 
2654 	/* Initialize packet schedulers */
2655 	pktsched_init();
2656 
2657 	/* Initialize flow advisory subsystem */
2658 	flowadv_init();
2659 
2660 	/* Initialize the pktap virtual interface */
2661 	pktap_init();
2662 
2663 	/* Initialize the service class to dscp map */
2664 	net_qos_map_init();
2665 
2666 	/* Initialize the interface low power mode event handler */
2667 	if_low_power_evhdlr_init();
2668 
2669 	/* Initialize the interface offload port list subsystem */
2670 	if_ports_used_init();
2671 
2672 #if DEBUG || DEVELOPMENT
2673 	/* Run self-tests */
2674 	dlil_verify_sum16();
2675 #endif /* DEBUG || DEVELOPMENT */
2676 
2677 	/*
2678 	 * Create and start up the main DLIL input thread and the interface
2679 	 * detacher threads once everything is initialized.
2680 	 */
2681 	dlil_incr_pending_thread_count();
2682 	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2683 
2684 	/*
2685 	 * Create ifnet detacher thread.
2686 	 * When an interface gets detached, part of the detach processing
2687 	 * is delayed. The interface is added to delayed detach list
2688 	 * and this thread is woken up to call ifnet_detach_final
2689 	 * on these interfaces.
2690 	 */
2691 	dlil_incr_pending_thread_count();
2692 	if (kernel_thread_start(ifnet_detacher_thread_func,
2693 	    NULL, &thread) != KERN_SUCCESS) {
2694 		panic_plain("%s: couldn't create detacher thread", __func__);
2695 		/* NOTREACHED */
2696 	}
2697 	thread_deallocate(thread);
2698 
2699 	/*
2700 	 * Wait for the created kernel threads for dlil to get
2701 	 * scheduled and run at least once before we proceed
2702 	 */
2703 	lck_mtx_lock(&dlil_thread_sync_lock);
2704 	while (dlil_pending_thread_cnt != 0) {
2705 		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2706 		    "threads to get scheduled at least once.\n", __func__);
2707 		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2708 		    (PZERO - 1), __func__, NULL);
2709 		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2710 	}
2711 	lck_mtx_unlock(&dlil_thread_sync_lock);
2712 	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2713 	    "scheduled at least once. Proceeding.\n", __func__);
2714 }
2715 
2716 static void
if_flt_monitor_busy(struct ifnet * ifp)2717 if_flt_monitor_busy(struct ifnet *ifp)
2718 {
2719 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2720 
2721 	++ifp->if_flt_busy;
2722 	VERIFY(ifp->if_flt_busy != 0);
2723 }
2724 
/*
 * Drop one busy reference on the interface filter list.  This is an
 * alias for if_flt_monitor_leave(), which also wakes up any threads
 * blocked in if_flt_monitor_enter().  Caller holds if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2730 
2731 static void
if_flt_monitor_enter(struct ifnet * ifp)2732 if_flt_monitor_enter(struct ifnet *ifp)
2733 {
2734 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2735 
2736 	while (ifp->if_flt_busy) {
2737 		++ifp->if_flt_waiters;
2738 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2739 		    (PZERO - 1), "if_flt_monitor", NULL);
2740 	}
2741 	if_flt_monitor_busy(ifp);
2742 }
2743 
2744 static void
if_flt_monitor_leave(struct ifnet * ifp)2745 if_flt_monitor_leave(struct ifnet *ifp)
2746 {
2747 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2748 
2749 	VERIFY(ifp->if_flt_busy != 0);
2750 	--ifp->if_flt_busy;
2751 
2752 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2753 		ifp->if_flt_waiters = 0;
2754 		wakeup(&ifp->if_flt_head);
2755 	}
2756 }
2757 
/*
 * Attach an interface filter to ifp, copying the callbacks from
 * if_filter, and return the newly allocated filter via filter_ref.
 * On internal coproc interfaces only the detach callback is installed;
 * input/output/event/ioctl hooks are deliberately left NULL.
 *
 * Returns 0 on success, or ENXIO if the interface is not in the global
 * list or is no longer fully attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/*
	 * Takes an IO refcnt on success (second arg 1); it is released
	 * via ifnet_decr_iorefcnt() at the bottom of the success path.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: no error path needed after this point for allocation */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* enter the filter monitor to serialize list mutation */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	/* global and per-interface attach statistics */
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* non-OS filters tracked per interface (affects TSO etc.) */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	/* on failure the filter (if allocated) is never linked; free it */
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2847 
/*
 * Detach an interface filter.
 *
 * When detached == 0 (explicit detach via dlil_detach_filter()), search
 * every attached interface for the filter, unlink it under the filter
 * monitor, and destroy it; EINVAL is returned if the reference is not
 * found.  When detached != 0, the caller (ifnet_detach_final()) has
 * already emptied if_flt_head, so only the counters are adjusted before
 * the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				/* filt_skip set means a detach is in flight */
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and wait out any busy monitor */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* Undo the attach-time statistics; counts must not underflow */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only reachable with filter != NULL on the EINVAL path above */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2968 
2969 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2970 dlil_detach_filter(interface_filter_t filter)
2971 {
2972 	if (filter == NULL) {
2973 		return;
2974 	}
2975 	dlil_detach_filter_internal(filter, 0);
2976 }
2977 
2978 __private_extern__ boolean_t
dlil_has_ip_filter(void)2979 dlil_has_ip_filter(void)
2980 {
2981 	boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2982 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2983 	return has_filter;
2984 }
2985 
2986 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2987 dlil_has_if_filter(struct ifnet *ifp)
2988 {
2989 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2990 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2991 	return has_filter;
2992 }
2993 
2994 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2995 dlil_input_wakeup(struct dlil_threading_info *inp)
2996 {
2997 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2998 
2999 	inp->dlth_flags |= DLIL_INPUT_WAITING;
3000 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3001 		inp->dlth_wtot++;
3002 		wakeup_one((caddr_t)&inp->dlth_flags);
3003 	}
3004 }
3005 
/*
 * Bootstrap for the main DLIL input thread.  Marks the thread as
 * embryonic, issues a self-wakeup so the continuation runs once to
 * clear that state, then parks in dlil_main_input_thread_cont(),
 * which performs all subsequent processing.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* must assert_wait before the wakeup below so it isn't missed */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3028 
3029 /*
3030  * Main input thread:
3031  *
3032  *   a) handles all inbound packets for lo0
3033  *   b) handles all inbound packets for interfaces with no dedicated
3034  *	input thread (e.g. anything but Ethernet/PDP or those that support
3035  *	opportunistic polling.)
3036  *   c) protocol registrations
3037  *   d) packet injections
3038  */
3039 __attribute__((noreturn))
3040 static void
dlil_main_input_thread_cont(void * v,wait_result_t wres)3041 dlil_main_input_thread_cont(void *v, wait_result_t wres)
3042 {
3043 	struct dlil_main_threading_info *inpm = v;
3044 	struct dlil_threading_info *inp = v;
3045 
3046 	/* main input thread is uninterruptible */
3047 	VERIFY(wres != THREAD_INTERRUPTED);
3048 	lck_mtx_lock_spin(&inp->dlth_lock);
3049 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
3050 	    DLIL_INPUT_RUNNING)));
3051 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
3052 
3053 	while (1) {
3054 		struct mbuf *m = NULL, *m_loop = NULL;
3055 		u_int32_t m_cnt, m_cnt_loop;
3056 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3057 		boolean_t proto_req;
3058 		boolean_t embryonic;
3059 
3060 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3061 
3062 		if (__improbable(embryonic =
3063 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3064 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3065 		}
3066 
3067 		proto_req = (inp->dlth_flags &
3068 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
3069 
3070 		/* Packets for non-dedicated interfaces other than lo0 */
3071 		m_cnt = qlen(&inp->dlth_pkts);
3072 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3073 		m = pkt.cp_mbuf;
3074 
3075 		/* Packets exclusive to lo0 */
3076 		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
3077 		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
3078 		m_loop = pkt.cp_mbuf;
3079 
3080 		inp->dlth_wtot = 0;
3081 
3082 		lck_mtx_unlock(&inp->dlth_lock);
3083 
3084 		if (__improbable(embryonic)) {
3085 			dlil_decr_pending_thread_count();
3086 		}
3087 
3088 		/*
3089 		 * NOTE warning %%% attention !!!!
3090 		 * We should think about putting some thread starvation
3091 		 * safeguards if we deal with long chains of packets.
3092 		 */
3093 		if (__probable(m_loop != NULL)) {
3094 			dlil_input_packet_list_extended(lo_ifp, m_loop,
3095 			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
3096 		}
3097 
3098 		if (__probable(m != NULL)) {
3099 			dlil_input_packet_list_extended(NULL, m,
3100 			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
3101 		}
3102 
3103 		if (__improbable(proto_req)) {
3104 			proto_input_run();
3105 		}
3106 
3107 		lck_mtx_lock_spin(&inp->dlth_lock);
3108 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3109 		/* main input thread cannot be terminated */
3110 		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
3111 		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
3112 			break;
3113 		}
3114 	}
3115 
3116 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3117 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3118 	lck_mtx_unlock(&inp->dlth_lock);
3119 	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
3120 
3121 	VERIFY(0);      /* we should never get here */
3122 	/* NOTREACHED */
3123 	__builtin_unreachable();
3124 }
3125 
3126 /*
3127  * Input thread for interfaces with legacy input model.
3128  */
3129 __attribute__((noreturn))
3130 static void
dlil_input_thread_func(void * v,wait_result_t w)3131 dlil_input_thread_func(void *v, wait_result_t w)
3132 {
3133 #pragma unused(w)
3134 	char thread_name[MAXTHREADNAMESIZE];
3135 	struct dlil_threading_info *inp = v;
3136 	struct ifnet *ifp = inp->dlth_ifp;
3137 
3138 	VERIFY(inp != dlil_main_input_thread);
3139 	VERIFY(ifp != NULL);
3140 	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
3141 	    !(ifp->if_xflags & IFXF_LEGACY));
3142 	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
3143 	    !(ifp->if_xflags & IFXF_LEGACY));
3144 	VERIFY(current_thread() == inp->dlth_thread);
3145 
3146 	/* construct the name for this thread, and then apply it */
3147 	bzero(thread_name, sizeof(thread_name));
3148 	(void) snprintf(thread_name, sizeof(thread_name),
3149 	    "dlil_input_%s", ifp->if_xname);
3150 	thread_set_thread_name(inp->dlth_thread, thread_name);
3151 
3152 	lck_mtx_lock(&inp->dlth_lock);
3153 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3154 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3155 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3156 	/* wake up once to get out of embryonic state */
3157 	dlil_input_wakeup(inp);
3158 	lck_mtx_unlock(&inp->dlth_lock);
3159 	(void) thread_block_parameter(dlil_input_thread_cont, inp);
3160 	/* NOTREACHED */
3161 	__builtin_unreachable();
3162 }
3163 
/*
 * Continuation for a per-interface legacy input thread: drains the
 * thread's packet queue, syncs interface stats, and re-blocks; if the
 * thread is interrupted or flagged, it terminates itself via
 * dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* either an interrupt or an explicit flag requests termination */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* loop until no work remains besides RUNNING/TERMINATE bits */
	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: clear the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while the (possibly long) work is done */
		lck_mtx_unlock(&inp->dlth_lock);

		/* tell ifnet_attach() this thread has run at least once */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait and block on this continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3267 
3268 /*
3269  * Input thread for interfaces with opportunistic polling input model.
3270  */
3271 __attribute__((noreturn))
3272 static void
dlil_rxpoll_input_thread_func(void * v,wait_result_t w)3273 dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
3274 {
3275 #pragma unused(w)
3276 	char thread_name[MAXTHREADNAMESIZE];
3277 	struct dlil_threading_info *inp = v;
3278 	struct ifnet *ifp = inp->dlth_ifp;
3279 
3280 	VERIFY(inp != dlil_main_input_thread);
3281 	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
3282 	    (ifp->if_xflags & IFXF_LEGACY));
3283 	VERIFY(current_thread() == inp->dlth_thread);
3284 
3285 	/* construct the name for this thread, and then apply it */
3286 	bzero(thread_name, sizeof(thread_name));
3287 	(void) snprintf(thread_name, sizeof(thread_name),
3288 	    "dlil_input_poll_%s", ifp->if_xname);
3289 	thread_set_thread_name(inp->dlth_thread, thread_name);
3290 
3291 	lck_mtx_lock(&inp->dlth_lock);
3292 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3293 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3294 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3295 	/* wake up once to get out of embryonic state */
3296 	dlil_input_wakeup(inp);
3297 	lck_mtx_unlock(&inp->dlth_lock);
3298 	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
3299 	/* NOTREACHED */
3300 	__builtin_unreachable();
3301 }
3302 
3303 __attribute__((noreturn))
3304 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3305 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3306 {
3307 	struct dlil_threading_info *inp = v;
3308 	struct ifnet *ifp = inp->dlth_ifp;
3309 	struct timespec ts;
3310 
3311 	lck_mtx_lock_spin(&inp->dlth_lock);
3312 	if (__improbable(wres == THREAD_INTERRUPTED ||
3313 	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3314 		goto terminate;
3315 	}
3316 
3317 	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3318 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
3319 
3320 	while (1) {
3321 		struct mbuf *m = NULL;
3322 		uint32_t m_cnt, poll_req = 0;
3323 		uint64_t m_size = 0;
3324 		ifnet_model_t mode;
3325 		struct timespec now, delta;
3326 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3327 		boolean_t notify;
3328 		boolean_t embryonic;
3329 		uint64_t ival;
3330 
3331 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3332 
3333 		if (__improbable(embryonic =
3334 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3335 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3336 			goto skip;
3337 		}
3338 
3339 		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3340 			ival = IF_RXPOLL_INTERVALTIME_MIN;
3341 		}
3342 
3343 		/* Link parameters changed? */
3344 		if (ifp->if_poll_update != 0) {
3345 			ifp->if_poll_update = 0;
3346 			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3347 		}
3348 
3349 		/* Current operating mode */
3350 		mode = ifp->if_poll_mode;
3351 
3352 		/*
3353 		 * Protocol registration and injection must always use
3354 		 * the main input thread; in theory the latter can utilize
3355 		 * the corresponding input thread where the packet arrived
3356 		 * on, but that requires our knowing the interface in advance
3357 		 * (and the benefits might not worth the trouble.)
3358 		 */
3359 		VERIFY(!(inp->dlth_flags &
3360 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3361 
3362 		/* Total count of all packets */
3363 		m_cnt = qlen(&inp->dlth_pkts);
3364 
3365 		/* Total bytes of all packets */
3366 		m_size = qsize(&inp->dlth_pkts);
3367 
3368 		/* Packets for this interface */
3369 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3370 		m = pkt.cp_mbuf;
3371 		VERIFY(m != NULL || m_cnt == 0);
3372 
3373 		nanouptime(&now);
3374 		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3375 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3376 		}
3377 
3378 		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3379 		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3380 			u_int32_t ptot, btot;
3381 
3382 			/* Accumulate statistics for current sampling */
3383 			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3384 
3385 			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3386 				goto skip;
3387 			}
3388 
3389 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3390 
3391 			/* Calculate min/max of inbound bytes */
3392 			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3393 			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3394 				ifp->if_rxpoll_bmin = btot;
3395 			}
3396 			if (btot > ifp->if_rxpoll_bmax) {
3397 				ifp->if_rxpoll_bmax = btot;
3398 			}
3399 
3400 			/* Calculate EWMA of inbound bytes */
3401 			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3402 
3403 			/* Calculate min/max of inbound packets */
3404 			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3405 			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3406 				ifp->if_rxpoll_pmin = ptot;
3407 			}
3408 			if (ptot > ifp->if_rxpoll_pmax) {
3409 				ifp->if_rxpoll_pmax = ptot;
3410 			}
3411 
3412 			/* Calculate EWMA of inbound packets */
3413 			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3414 
3415 			/* Reset sampling statistics */
3416 			PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3417 
3418 			/* Calculate EWMA of wakeup requests */
3419 			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3420 			    if_rxpoll_decay);
3421 			inp->dlth_wtot = 0;
3422 
3423 			if (dlil_verbose) {
3424 				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3425 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3426 				}
3427 				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3428 				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3429 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3430 					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3431 					    "limits [%d/%d], wreq avg %d "
3432 					    "limits [%d/%d], bytes avg %d "
3433 					    "limits [%d/%d]\n", if_name(ifp),
3434 					    (ifp->if_poll_mode ==
3435 					    IFNET_MODEL_INPUT_POLL_ON) ?
3436 					    "ON" : "OFF", ifp->if_rxpoll_pavg,
3437 					    ifp->if_rxpoll_pmax,
3438 					    ifp->if_rxpoll_plowat,
3439 					    ifp->if_rxpoll_phiwat,
3440 					    ifp->if_rxpoll_wavg,
3441 					    ifp->if_rxpoll_wlowat,
3442 					    ifp->if_rxpoll_whiwat,
3443 					    ifp->if_rxpoll_bavg,
3444 					    ifp->if_rxpoll_blowat,
3445 					    ifp->if_rxpoll_bhiwat);
3446 				}
3447 			}
3448 
3449 			/* Perform mode transition, if necessary */
3450 			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3451 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3452 			}
3453 
3454 			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3455 			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3456 				goto skip;
3457 			}
3458 
3459 			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3460 			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3461 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3462 				mode = IFNET_MODEL_INPUT_POLL_OFF;
3463 			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3464 			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3465 			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3466 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3467 				mode = IFNET_MODEL_INPUT_POLL_ON;
3468 			}
3469 
3470 			if (mode != ifp->if_poll_mode) {
3471 				ifp->if_poll_mode = mode;
3472 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3473 				poll_req++;
3474 			}
3475 		}
3476 skip:
3477 		notify = dlil_input_stats_sync(ifp, inp);
3478 
3479 		lck_mtx_unlock(&inp->dlth_lock);
3480 
3481 		if (__improbable(embryonic)) {
3482 			ifnet_decr_pending_thread_count(ifp);
3483 		}
3484 
3485 		if (__improbable(notify)) {
3486 			ifnet_notify_data_threshold(ifp);
3487 		}
3488 
3489 		/*
3490 		 * If there's a mode change and interface is still attached,
3491 		 * perform a downcall to the driver for the new mode.  Also
3492 		 * hold an IO refcnt on the interface to prevent it from
3493 		 * being detached (will be release below.)
3494 		 */
3495 		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3496 			struct ifnet_model_params p = {
3497 				.model = mode, .reserved = { 0 }
3498 			};
3499 			errno_t err;
3500 
3501 			if (dlil_verbose) {
3502 				DLIL_PRINTF("%s: polling is now %s, "
3503 				    "pkts avg %d max %d limits [%d/%d], "
3504 				    "wreq avg %d limits [%d/%d], "
3505 				    "bytes avg %d limits [%d/%d]\n",
3506 				    if_name(ifp),
3507 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3508 				    "ON" : "OFF", ifp->if_rxpoll_pavg,
3509 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3510 				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3511 				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3512 				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3513 				    ifp->if_rxpoll_bhiwat);
3514 			}
3515 
3516 			if ((err = ((*ifp->if_input_ctl)(ifp,
3517 			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3518 				DLIL_PRINTF("%s: error setting polling mode "
3519 				    "to %s (%d)\n", if_name(ifp),
3520 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3521 				    "ON" : "OFF", err);
3522 			}
3523 
3524 			switch (mode) {
3525 			case IFNET_MODEL_INPUT_POLL_OFF:
3526 				ifnet_set_poll_cycle(ifp, NULL);
3527 				ifp->if_rxpoll_offreq++;
3528 				if (err != 0) {
3529 					ifp->if_rxpoll_offerr++;
3530 				}
3531 				break;
3532 
3533 			case IFNET_MODEL_INPUT_POLL_ON:
3534 				net_nsectimer(&ival, &ts);
3535 				ifnet_set_poll_cycle(ifp, &ts);
3536 				ifnet_poll(ifp);
3537 				ifp->if_rxpoll_onreq++;
3538 				if (err != 0) {
3539 					ifp->if_rxpoll_onerr++;
3540 				}
3541 				break;
3542 
3543 			default:
3544 				VERIFY(0);
3545 				/* NOTREACHED */
3546 			}
3547 
3548 			/* Release the IO refcnt */
3549 			ifnet_decr_iorefcnt(ifp);
3550 		}
3551 
3552 		/*
3553 		 * NOTE warning %%% attention !!!!
3554 		 * We should think about putting some thread starvation
3555 		 * safeguards if we deal with long chains of packets.
3556 		 */
3557 		if (__probable(m != NULL)) {
3558 			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3559 		}
3560 
3561 		lck_mtx_lock_spin(&inp->dlth_lock);
3562 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3563 		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3564 		    DLIL_INPUT_TERMINATE))) {
3565 			break;
3566 		}
3567 	}
3568 
3569 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3570 
3571 	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3572 terminate:
3573 		lck_mtx_unlock(&inp->dlth_lock);
3574 		dlil_terminate_input_thread(inp);
3575 		/* NOTREACHED */
3576 	} else {
3577 		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3578 		lck_mtx_unlock(&inp->dlth_lock);
3579 		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3580 		    inp);
3581 		/* NOTREACHED */
3582 	}
3583 
3584 	VERIFY(0);      /* we should never get here */
3585 	/* NOTREACHED */
3586 	__builtin_unreachable();
3587 }
3588 
3589 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3590 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3591 {
3592 	if (p != NULL) {
3593 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3594 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3595 			return EINVAL;
3596 		}
3597 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3598 		    p->packets_lowat >= p->packets_hiwat) {
3599 			return EINVAL;
3600 		}
3601 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3602 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3603 			return EINVAL;
3604 		}
3605 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3606 		    p->bytes_lowat >= p->bytes_hiwat) {
3607 			return EINVAL;
3608 		}
3609 		if (p->interval_time != 0 &&
3610 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3611 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3612 		}
3613 	}
3614 	return 0;
3615 }
3616 
/*
 * Recompute the interface's input-poll tunables from the current input
 * link rate and (optionally) caller-supplied parameters.  With a zero
 * link rate and no caller parameters, polling is effectively disabled
 * by zeroing the low watermarks and maxing out the high ones.  Caller
 * must hold the input thread's dlth_lock (see dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry whose speed <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * A non-zero if_rxpoll_max sysctl overrides the caller's
		 * packets_limit; similarly, the caller's interval_time is
		 * honored only while the global interval is still at its
		 * default IF_RXPOLL_INTERVALTIME.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond hold times into timespec form for comparisons */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3686 
/*
 * Must be called on an attached ifnet (caller is expected to check.)
 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
 * Returns ENXIO if the interface is not poll-capable, or EINVAL if the
 * supplied parameters fail validation.
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	/* a poll-capable interface with a dedicated input thread is required */
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after the link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3728 
3729 /*
3730  * Must be called on an attached ifnet (caller is expected to check.)
3731  */
3732 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3733 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3734 {
3735 	struct dlil_threading_info *inp;
3736 
3737 	VERIFY(ifp != NULL && p != NULL);
3738 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3739 		return ENXIO;
3740 	}
3741 
3742 	bzero(p, sizeof(*p));
3743 
3744 	lck_mtx_lock(&inp->dlth_lock);
3745 	p->packets_limit = ifp->if_rxpoll_plim;
3746 	p->packets_lowat = ifp->if_rxpoll_plowat;
3747 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3748 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3749 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3750 	p->interval_time = ifp->if_rxpoll_ival;
3751 	lck_mtx_unlock(&inp->dlth_lock);
3752 
3753 	return 0;
3754 }
3755 
3756 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3757 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3758     const struct ifnet_stat_increment_param *s)
3759 {
3760 	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3761 }
3762 
3763 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3764 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3765     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3766 {
3767 	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3768 }
3769 
3770 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3771 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3772     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3773 {
3774 	return ifnet_input_common(ifp, m_head, m_tail, s,
3775 	           (m_head != NULL), TRUE);
3776 }
3777 
/*
 * Common backend for the ifnet_input* KPIs: validates arguments, counts
 * (or trusts) the packet chain, and hands it to the interface's DLIL
 * input function while holding a datamov (IO) reference on the ifp.
 * Frees the chain and returns EINVAL on any validation failure.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* a NULL chain is only valid on the poll path; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to count and find the tail */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* recount the chain to cross-check driver's stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		/* trust the driver-supplied counts and tail */
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): _s is updated with the locally-computed counts,
	 * but `s` (which aliases _s only when the caller passed NULL) is
	 * what gets handed down — so for the extended path the driver's
	 * original stats are forwarded.  Looks intentional given the
	 * packets_in assertion above, but worth confirming.
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3892 
3893 #if SKYWALK
/*
 * Atomically install fn as the interface's DLIL input function, but only
 * if the currently-installed handler is still the default
 * dlil_input_handler; returns EBUSY if another handler is in place.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3901 
/*
 * Restore the default DLIL input function.  Retries the compare-and-swap
 * until it succeeds, since the installed handler may change between
 * reading if_input_dlil and attempting the swap.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
3911 
/*
 * Atomically install fn as the interface's DLIL output function, but
 * only if the currently-installed handler is still the default
 * dlil_output_handler; returns EBUSY otherwise.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3919 
/*
 * Restore the default DLIL output function.  Retries the compare-and-swap
 * until it succeeds (the installed handler may change concurrently).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3929 #endif /* SKYWALK */
3930 
3931 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3932 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3933 {
3934 	return ifp->if_output(ifp, m);
3935 }
3936 
/*
 * Default DLIL input handler: route the inbound chain to the interface's
 * input thread strategy (falling back to the main input thread when the
 * interface has no dedicated one).
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	/* interfaces without a dedicated input thread use the main one */
	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* synchronous receive path, used for testing on dev/debug kernels */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3957 
/*
 * Asynchronous input strategy: queue the chain on the input thread's
 * packet list (or the dedicated lo0 list) under dlth_lock, update stats,
 * and wake the input thread to process it.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the lock across the affinity call, then re-take it */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* recount the chain and panic on any mismatch with driver stats */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* notify outside the lock */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4070 
/*
 * Synchronous input strategy (dev/debug NET_THREAD_SYNC_RX path): queue
 * the chain, then immediately drain the input thread's packet list and
 * process it on the calling thread instead of waking the input thread.
 * Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* recount the chain and panic on any mismatch with driver stats */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued so far, including what we just added */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4155 
4156 #if SKYWALK
/*
 * Atomically install fn as the driver's if_output, but only if if_output
 * still equals the saved original (if_save_output); returns EBUSY if a
 * different handler has already been installed.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4164 
/*
 * Restore the saved original if_output.  Retries the compare-and-swap
 * until it succeeds (if_output may change concurrently).
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4174 
/*
 * Atomically install fn as the driver's if_start, but only if if_start
 * still equals the saved original (if_save_start); returns EBUSY if a
 * different handler has already been installed.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return atomic_test_set_ptr(&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4182 
/*
 * Restore the saved original if_start.  Retries the compare-and-swap
 * until it succeeds (if_start may change concurrently).
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4192 #endif /* SKYWALK */
4193 
/*
 * Signal the interface's starter thread to dequeue/transmit.  When
 * resetfc is TRUE the flow-controlled state is cleared first (used when
 * flow control is being lifted); otherwise a flow-controlled interface
 * is left alone.  No-op unless the interface uses the TXSTART model.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	/* record the request even if we don't wake the thread right now */
	ifp->if_start_req++;
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4223 
4224 void
ifnet_start(struct ifnet * ifp)4225 ifnet_start(struct ifnet *ifp)
4226 {
4227 	ifnet_start_common(ifp, FALSE);
4228 }
4229 
/*
 * Entry point for an interface's dedicated starter thread: names the
 * thread, optionally binds the lo0 starter to the main input thread's
 * affinity set, signals out of the embryonic state, and then blocks
 * with ifnet_start_thread_cont as its continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4295 
/*
 * Continuation body of the starter thread: services transmit requests by
 * repeatedly invoking the driver's if_start until no new request arrived
 * during the last pass (or the interface is disabled/terminating), then
 * re-blocks on itself as continuation — or tears the thread down when
 * termination is requested.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: just report readiness and re-block */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		/* snapshot the request count to detect new requests below */
		u_int32_t req = ifp->if_start_req;
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* a delayed start also bounds the sleep */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4443 
4444 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4445 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4446 {
4447 	if (ts == NULL) {
4448 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4449 	} else {
4450 		*(&ifp->if_start_cycle) = *ts;
4451 	}
4452 
4453 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4454 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4455 		    if_name(ifp), ts->tv_nsec);
4456 	}
4457 }
4458 
4459 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4460 ifnet_poll_wakeup(struct ifnet *ifp)
4461 {
4462 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4463 
4464 	ifp->if_poll_req++;
4465 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4466 	    ifp->if_poll_thread != THREAD_NULL) {
4467 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4468 	}
4469 }
4470 
/*
 * Request a run of the interface's RX poller thread.  Acquires the
 * poll lock and signals the poller via ifnet_poll_wakeup(), which
 * only issues a wakeup when the poller exists and is idle.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4481 
/*
 * Entry point of the per-interface RX poller thread (RXPOLL mode).
 * Names the thread, marks it embryonic, arms a wait, and transfers
 * control to ifnet_poll_thread_cont() via a continuation.  Never
 * returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	/* arm the wait before publishing the embryonic state */
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* block; wakes up in ifnet_poll_thread_cont() */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4510 
/*
 * Continuation body of the RX poller thread.  Each wakeup services
 * poll requests in a loop: it asks the driver (if_input_poll) for up
 * to m_lim packets and feeds them to ifnet_input_common(), until no
 * new request arrived during the pass or the interface is
 * terminating.  It then either re-arms a (possibly timed) wait and
 * blocks on itself as the continuation, or tears the thread down.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* note: terminate path below expects if_poll_lock to be held */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/* first wakeup: leave embryonic state without servicing requests */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot request counter; compared after the pass */
		u_int16_t req = ifp->if_poll_req;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll still notifies the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		/* block on ourselves; continuation re-enters this function */
		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4677 
4678 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4679 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4680 {
4681 	if (ts == NULL) {
4682 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4683 	} else {
4684 		*(&ifp->if_poll_cycle) = *ts;
4685 	}
4686 
4687 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4688 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4689 		    if_name(ifp), ts->tv_nsec);
4690 	}
4691 }
4692 
4693 void
ifnet_purge(struct ifnet * ifp)4694 ifnet_purge(struct ifnet *ifp)
4695 {
4696 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4697 		if_qflush_snd(ifp, false);
4698 	}
4699 }
4700 
4701 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4702 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4703 {
4704 	IFCQ_LOCK_ASSERT_HELD(ifq);
4705 
4706 	if (!(IFCQ_IS_READY(ifq))) {
4707 		return;
4708 	}
4709 
4710 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4711 		struct tb_profile tb = {
4712 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4713 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4714 		};
4715 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4716 	}
4717 
4718 	ifclassq_update(ifq, ev);
4719 }
4720 
4721 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4722 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4723 {
4724 	switch (ev) {
4725 	case CLASSQ_EV_LINK_BANDWIDTH:
4726 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4727 			ifp->if_poll_update++;
4728 		}
4729 		break;
4730 
4731 	default:
4732 		break;
4733 	}
4734 }
4735 
4736 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4737 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4738 {
4739 	struct ifclassq *ifq;
4740 	u_int32_t omodel;
4741 	errno_t err;
4742 
4743 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4744 		return EINVAL;
4745 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4746 		return ENXIO;
4747 	}
4748 
4749 	ifq = ifp->if_snd;
4750 	IFCQ_LOCK(ifq);
4751 	omodel = ifp->if_output_sched_model;
4752 	ifp->if_output_sched_model = model;
4753 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4754 		ifp->if_output_sched_model = omodel;
4755 	}
4756 	IFCQ_UNLOCK(ifq);
4757 
4758 	return err;
4759 }
4760 
4761 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4762 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4763 {
4764 	if (ifp == NULL) {
4765 		return EINVAL;
4766 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4767 		return ENXIO;
4768 	}
4769 
4770 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4771 
4772 	return 0;
4773 }
4774 
4775 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4776 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4777 {
4778 	if (ifp == NULL || maxqlen == NULL) {
4779 		return EINVAL;
4780 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4781 		return ENXIO;
4782 	}
4783 
4784 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4785 
4786 	return 0;
4787 }
4788 
4789 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4790 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4791 {
4792 	errno_t err;
4793 
4794 	if (ifp == NULL || pkts == NULL) {
4795 		err = EINVAL;
4796 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4797 		err = ENXIO;
4798 	} else {
4799 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4800 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4801 	}
4802 
4803 	return err;
4804 }
4805 
4806 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4807 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4808     u_int32_t *pkts, u_int32_t *bytes)
4809 {
4810 	errno_t err;
4811 
4812 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4813 	    (pkts == NULL && bytes == NULL)) {
4814 		err = EINVAL;
4815 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4816 		err = ENXIO;
4817 	} else {
4818 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4819 		    pkts, bytes);
4820 	}
4821 
4822 	return err;
4823 }
4824 
4825 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4826 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4827 {
4828 	struct dlil_threading_info *inp;
4829 
4830 	if (ifp == NULL) {
4831 		return EINVAL;
4832 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4833 		return ENXIO;
4834 	}
4835 
4836 	if (maxqlen == 0) {
4837 		maxqlen = if_rcvq_maxlen;
4838 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4839 		maxqlen = IF_RCVQ_MINLEN;
4840 	}
4841 
4842 	inp = ifp->if_inp;
4843 	lck_mtx_lock(&inp->dlth_lock);
4844 	qlimit(&inp->dlth_pkts) = maxqlen;
4845 	lck_mtx_unlock(&inp->dlth_lock);
4846 
4847 	return 0;
4848 }
4849 
4850 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4851 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4852 {
4853 	struct dlil_threading_info *inp;
4854 
4855 	if (ifp == NULL || maxqlen == NULL) {
4856 		return EINVAL;
4857 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4858 		return ENXIO;
4859 	}
4860 
4861 	inp = ifp->if_inp;
4862 	lck_mtx_lock(&inp->dlth_lock);
4863 	*maxqlen = qlimit(&inp->dlth_pkts);
4864 	lck_mtx_unlock(&inp->dlth_lock);
4865 	return 0;
4866 }
4867 
4868 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4869 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4870     uint16_t delay_timeout)
4871 {
4872 	if (delay_qlen > 0 && delay_timeout > 0) {
4873 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4874 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4875 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4876 		/* convert timeout to nanoseconds */
4877 		ifp->if_start_delay_timeout *= 1000;
4878 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4879 		    ifp->if_xname, (uint32_t)delay_qlen,
4880 		    (uint32_t)delay_timeout);
4881 	} else {
4882 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4883 	}
4884 }
4885 
4886 /*
4887  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4888  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4889  * buf holds the full header.
4890  */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned scratch copy, used only when buf is misaligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set outside the ECN field */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * RFC 1624-style incremental header-checksum update:
		 * add the old TOS, subtract the new, then fold the
		 * carry back into the low 16 bits once.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			/* copy the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set in the flow word */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		/* no checksum update needed: IPv6 has no header checksum */
		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4946 
/*
 * Core single-packet enqueue path: stamp the packet with a timestamp
 * (unless it already carries a valid one), update foreground/realtime
 * activity timestamps, apply the Wi-Fi multicast DSCP-clearing
 * workaround, run the delayed-start heuristics, hand the packet to
 * the classq (ifcq if given, else ifp->if_snd), and kick the driver's
 * start routine when appropriate.  The caller's packet is always
 * consumed.  Returns the classq enqueue error (or ENOMEM on a failed
 * mbuf pullup); *pdrop reports whether the packet was dropped.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				/* not IP/IPv6: exits the switch, skipping
				 * the DSCP workaround for this packet */
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* non-NULL only when the Wi-Fi multicast workaround applies */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open a fresh delay window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5257 
5258 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5259 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5260     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5261     boolean_t flush, boolean_t *pdrop)
5262 {
5263 	int error;
5264 
5265 	/* enqueue the packet (caller consumes object) */
5266 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5267 	    cnt, bytes, pdrop);
5268 
5269 	/*
5270 	 * Tell the driver to start dequeueing; do this even when the queue
5271 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5272 	 * be dequeueing from other unsuspended queues.
5273 	 */
5274 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5275 		ifnet_start(ifp);
5276 	}
5277 	return error;
5278 }
5279 
5280 #if DEVELOPMENT || DEBUG
5281 void
trace_pkt_dump_payload(struct ifnet * ifp,struct __kern_packet * kpkt,bool input)5282 trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
5283 {
5284 #define MIN_TRACE_DUMP_PKT_SIZE  32
5285 	struct ether_header *eh = NULL;
5286 	struct udphdr *uh = NULL;
5287 
5288 	if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
5289 	    flow_key_trace.fk_ipver != IPV6_VERSION))) {
5290 		return;
5291 	}
5292 
5293 	uint16_t bdlim, bdlen, bdoff;
5294 	uint8_t *baddr;
5295 
5296 	MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);
5297 
5298 	if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
5299 		if (!IFNET_IS_ETHERNET(ifp)) {
5300 			return;
5301 		}
5302 
5303 		sa_family_t af = AF_UNSPEC;
5304 		ASSERT(kpkt->pkt_l2_len > 0);
5305 
5306 		baddr += kpkt->pkt_headroom;
5307 		eh = (struct ether_header *)(void *)baddr;
5308 		if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
5309 			return;
5310 		}
5311 		if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
5312 			return;
5313 		}
5314 		uint16_t ether_type = ntohs(eh->ether_type);
5315 		if (ether_type == ETHERTYPE_IP) {
5316 			af = AF_INET;
5317 		} else if (ether_type == ETHERTYPE_IPV6) {
5318 			af = AF_INET6;
5319 		} else {
5320 			return;
5321 		}
5322 		flow_pkt_classify(kpkt, ifp, af, input);
5323 	}
5324 
5325 	if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
5326 		return;
5327 	}
5328 
5329 	if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
5330 		return;
5331 	}
5332 
5333 	uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
5334 	uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;
5335 
5336 	if (kpkt->pkt_flow_udp_src != sport ||
5337 	    kpkt->pkt_flow_udp_dst != dport) {
5338 		return;
5339 	}
5340 
5341 	if (kpkt->pkt_flow_ip_ver == IPVERSION) {
5342 		struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
5343 		struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
5344 		struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;
5345 
5346 		if (ip_header->ip_src.s_addr != saddr->s_addr ||
5347 		    ip_header->ip_dst.s_addr != daddr->s_addr) {
5348 			return;
5349 		}
5350 	} else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
5351 		struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
5352 		struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
5353 		struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;
5354 
5355 		if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
5356 		    !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
5357 			return;
5358 		}
5359 	}
5360 
5361 	int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);
5362 
5363 	uint16_t pkt_payload_len = bdlim - bdoff;
5364 	pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);
5365 	pkt_payload_len -= udp_payload_offset;
5366 
5367 	if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
5368 		uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
5369 		uint8_t *payload = (uint8_t *)(uh + 1);
5370 
5371 		/* Trace 32 bytes of UDP transport payload */
5372 		uint64_t *trace1 = __DECONST(uint64_t *, payload);
5373 		uint64_t *trace2 = trace1 + 1;
5374 		uint64_t *trace3 = trace2 + 1;
5375 		uint64_t *trace4 = trace3 + 1;
5376 
5377 		if (input) {
5378 			KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5379 		} else {
5380 			KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5381 		}
5382 	}
5383 }
5384 #endif /* DEVELOPMENT || DEBUG */
5385 
5386 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5387 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5388 {
5389 	struct ifnet *ifp = handle;
5390 	boolean_t pdrop;        /* dummy */
5391 	uint32_t i;
5392 
5393 	ASSERT(n_pkts >= 1);
5394 	for (i = 0; i < n_pkts - 1; i++) {
5395 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5396 		    FALSE, &pdrop);
5397 	}
5398 	/* flush with the last packet */
5399 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5400 	    TRUE, &pdrop);
5401 
5402 	return 0;
5403 }
5404 
/*
 * Common enqueue path shared by the mbuf and Skywalk packet enqueue
 * entry points.  If a netem (network emulation) stage is configured
 * on the interface, the packet is diverted there; otherwise it goes
 * straight to the interface classq.  *pdrop reports whether the
 * packet was dropped rather than queued.
 */
static inline errno_t
ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
{
#if DEVELOPMENT || DEBUG
	/* ktrace payload dump applies only to native (QP_PACKET) packets */
	switch (pkt->cp_ptype) {
	case QP_PACKET: {
		trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
		break;
	}
	case QP_MBUF:
	case QP_INVALID: {
		break;
	}
	}
#endif /* DEVELOPMENT || DEBUG */

	if (ifp->if_output_netem != NULL) {
		bool drop;
		errno_t error;
		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
		*pdrop = drop ? TRUE : FALSE;
		return error;
	} else {
		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
	}
}
5432 
5433 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5434 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5435 {
5436 	boolean_t pdrop;
5437 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5438 }
5439 
/*
 * Enqueue a single mbuf packet (no chain) onto ifp's output queue.
 *
 * Returns EINVAL for malformed arguments (missing pkthdr or a chained
 * packet), ENXIO if the interface is not using the TX-start model or
 * is not fully attached, ENETDOWN if the interface is down.  In every
 * drop case the mbuf is freed here and *pdrop is set to TRUE.
 *
 * NOTE(review): when m itself is NULL, *pdrop is left untouched on
 * the EINVAL return — callers appear to only consult *pdrop after
 * passing a valid mbuf; confirm before relying on it otherwise.
 */
errno_t
ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
    boolean_t *pdrop)
{
	classq_pkt_t pkt;

	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
	    m->m_nextpkt != NULL) {
		if (m != NULL) {
			m_freem_list(m);
			*pdrop = TRUE;
		}
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    !IF_FULLY_ATTACHED(ifp)) {
		/* flag tested without lock for performance */
		m_freem(m);
		*pdrop = TRUE;
		return ENXIO;
	} else if (!(ifp->if_flags & IFF_UP)) {
		m_freem(m);
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_MBUF(&pkt, m);
	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
}
5468 
/*
 * Enqueue a pre-built chain of mbuf packets onto ifp's output queue.
 * The caller supplies both ends of the chain plus its packet and byte
 * counts, avoiding a re-walk of the list here.  Chain validity (pkthdr
 * on both ends, TX-start interface) is asserted rather than checked.
 * On ENXIO/ENETDOWN the whole chain is freed and *pdrop set to TRUE.
 */
errno_t
ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	classq_pkt_t head, tail;

	ASSERT(m_head != NULL);
	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
	ASSERT(m_tail != NULL);
	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	if (!IF_FULLY_ATTACHED(ifp)) {
		/* flag tested without lock for performance */
		m_freem_list(m_head);
		*pdrop = TRUE;
		return ENXIO;
	} else if (!(ifp->if_flags & IFF_UP)) {
		m_freem_list(m_head);
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
	           flush, pdrop);
}
5499 
5500 #if SKYWALK
5501 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5502 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5503     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5504 {
5505 	classq_pkt_t pkt;
5506 
5507 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5508 
5509 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5510 		if (kpkt != NULL) {
5511 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5512 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5513 			*pdrop = TRUE;
5514 		}
5515 		return EINVAL;
5516 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5517 	    !IF_FULLY_ATTACHED(ifp))) {
5518 		/* flag tested without lock for performance */
5519 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5520 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5521 		*pdrop = TRUE;
5522 		return ENXIO;
5523 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5524 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5525 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5526 		*pdrop = TRUE;
5527 		return ENETDOWN;
5528 	}
5529 
5530 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5531 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5532 }
5533 
5534 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5535 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5536     boolean_t flush, boolean_t *pdrop)
5537 {
5538 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5539 }
5540 
5541 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5542 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5543     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5544 {
5545 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5546 }
5547 
/*
 * Enqueue a pre-built chain of Skywalk packets onto ifp's output
 * queue (optionally a specific ifcq).  Caller supplies both chain
 * ends plus packet/byte counts.  On ENXIO/ENETDOWN the entire chain
 * is freed back to its pbufpool and *pdrop is set to TRUE.
 */
static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	classq_pkt_t head, tail;

	ASSERT(k_head != NULL);
	ASSERT(k_tail != NULL);
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	if (!IF_FULLY_ATTACHED(ifp)) {
		/* flag tested without lock for performance */
		pp_free_packet_chain(k_head, NULL);
		*pdrop = TRUE;
		return ENXIO;
	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
		pp_free_packet_chain(k_head, NULL);
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_PACKET(&head, k_head);
	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
	           flush, pdrop);
}
5576 
5577 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5578 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5579     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5580     boolean_t *pdrop)
5581 {
5582 	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5583 	           cnt, bytes, flush, pdrop);
5584 }
5585 
5586 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5587 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5588     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5589     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5590 {
5591 	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5592 	           cnt, bytes, flush, pdrop);
5593 }
5594 #endif /* SKYWALK */
5595 
/*
 * Dequeue a single mbuf packet from ifp's send queue.
 *
 * Returns EINVAL on bad arguments; ENXIO when the interface is not
 * TX-start, has an out-of-range scheduling model, or is not attached.
 * On success *mp holds the dequeued mbuf (NULL when the queue was
 * empty).  An IO reference is held across the dequeue so the
 * interface cannot detach mid-operation.
 */
errno_t
ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
{
	errno_t rc;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	if (ifp == NULL || mp == NULL) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an IO refcnt; dropped below after the dequeue completes */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* mbuf dequeue is only valid on non-native (compat) interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
	    &pkt, NULL, NULL, NULL, 0);
	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*mp = pkt.cp_mbuf;
	return rc;
}
5622 
/*
 * Dequeue a single mbuf packet belonging to service class sc from
 * ifp's send queue.  Same contract and IO-reference discipline as
 * ifnet_dequeue(), with the additional EINVAL check that sc is a
 * valid mbuf service class.
 */
errno_t
ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
    struct mbuf **mp)
{
	errno_t rc;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an IO refcnt; dropped below after the dequeue completes */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* mbuf dequeue is only valid on non-native (compat) interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*mp = pkt.cp_mbuf;
	return rc;
}
5650 
/*
 * Dequeue up to pkt_limit mbuf packets from ifp's send queue as a
 * chain.  *head receives the first packet (NULL when empty); *tail,
 * *cnt and *len are optional out-parameters for the chain tail,
 * packet count, and byte count.  Same error contract and
 * IO-reference discipline as ifnet_dequeue().
 */
errno_t
ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an IO refcnt; dropped below after the dequeue completes */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* mbuf dequeue is only valid on non-native (compat) interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5682 
/*
 * Dequeue mbuf packets from ifp's send queue as a chain, bounded by
 * an aggregate byte_limit rather than a packet count (the packet
 * limit is left at CLASSQ_DEQUEUE_MAX_PKT_LIMIT).  Out-parameters
 * and error contract match ifnet_dequeue_multi().
 */
errno_t
ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || byte_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an IO refcnt; dropped below after the dequeue completes */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* mbuf dequeue is only valid on non-native (compat) interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5714 
/*
 * Dequeue up to pkt_limit mbuf packets of service class sc from
 * ifp's send queue as a chain.  Out-parameters and error contract
 * match ifnet_dequeue_multi(), with the additional EINVAL check that
 * sc is a valid mbuf service class.
 */
errno_t
ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
    u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
	    !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an IO refcnt; dropped below after the dequeue completes */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	/* mbuf dequeue is only valid on non-native (compat) interfaces */
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
	    cnt, len, 0);
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5749 
5750 #if XNU_TARGET_OS_OSX
/*
 * Framing shim for interfaces that registered an if_framer_legacy
 * callback (one without the pre/post length out-parameters): zero
 * the optional pre/post outputs, then forward framing to the legacy
 * callback.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
5765 #endif /* XNU_TARGET_OS_OSX */
5766 
5767 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5768 packet_has_vlan_tag(struct mbuf * m)
5769 {
5770 	u_int   tag = 0;
5771 
5772 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5773 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5774 		if (tag == 0) {
5775 			/* the packet is just priority-tagged, clear the bit */
5776 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5777 		}
5778 	}
5779 	return tag != 0;
5780 }
5781 
/*
 * Run the inbound packet (*m_p) through every filter attached to ifp.
 *
 * The filter lock is dropped around each filt_input callback; the
 * busy/monitor marker taken beforehand keeps the filter list stable
 * while unlocked.  A non-zero filter result ends the walk and is
 * returned to the caller (callers treat EJUSTRETURN as "packet was
 * consumed by the filter").  Returns 0 when the packet survives all
 * filters; the filter may have replaced *m_p / *frame_header_p.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/* fast path: no filters attached */
	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; list stays busy */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5842 
/*
 * Run the outbound packet (*m_p) through every filter attached to
 * ifp.  Mirrors dlil_interface_filters_input(): the filter lock is
 * dropped around each filt_output callback while the busy/monitor
 * marker keeps the list stable, and a non-zero filter result ends
 * the walk and is returned to the caller.  Returns 0 when the packet
 * survives all filters.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; list stays busy */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5892 
/*
 * Deliver a chain of mbufs to the protocol attached to the interface.
 * v1 protocol KPIs accept one packet (with its frame header) at a
 * time, so the chain is unlinked and fed packet by packet; v2 KPIs
 * accept the whole list.  Packets are freed here if the protocol
 * input returns an error other than EJUSTRETURN (EJUSTRETURN means
 * the protocol took ownership).
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char *  frame_header;
			mbuf_t  next_packet;

			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5924 
/*
 * Accumulate the per-delivery statistics in s into the input
 * thread's local counters (inp->dlth_stats).  These are folded into
 * the ifnet later by dlil_input_stats_sync(), so no atomics are
 * needed here.  When called from the RX poll path (poll == TRUE),
 * also credit the interface's transient polling counters.
 */
static void
dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
    struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
{
	struct ifnet_stat_increment_param *d = &inp->dlth_stats;

	/* guard each add to avoid dirtying cachelines for zero deltas */
	if (s->packets_in != 0) {
		d->packets_in += s->packets_in;
	}
	if (s->bytes_in != 0) {
		d->bytes_in += s->bytes_in;
	}
	if (s->errors_in != 0) {
		d->errors_in += s->errors_in;
	}

	if (s->packets_out != 0) {
		d->packets_out += s->packets_out;
	}
	if (s->bytes_out != 0) {
		d->bytes_out += s->bytes_out;
	}
	if (s->errors_out != 0) {
		d->errors_out += s->errors_out;
	}

	if (s->collisions != 0) {
		d->collisions += s->collisions;
	}
	if (s->dropped != 0) {
		d->dropped += s->dropped;
	}

	if (poll) {
		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
	}
}
5962 
/*
 * Fold the input thread's locally accumulated statistics
 * (inp->dlth_stats) into the ifnet's global counters, zeroing each
 * local counter as it is drained.  Returns TRUE when the interface
 * has a non-zero if_data_threshold configured (the caller presumably
 * uses this to decide whether to post a threshold notification --
 * not visible from here).
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6022 
6023 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6024 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6025 {
6026 	return dlil_input_packet_list_common(ifp, m, 0,
6027 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6028 }
6029 
6030 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6031 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6032     u_int32_t cnt, ifnet_model_t mode)
6033 {
6034 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6035 }
6036 
6037 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)6038 dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
6039     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
6040 {
6041 	int error = 0;
6042 	protocol_family_t protocol_family;
6043 	mbuf_t next_packet;
6044 	ifnet_t ifp = ifp_param;
6045 	char *frame_header = NULL;
6046 	struct if_proto *last_ifproto = NULL;
6047 	mbuf_t pkt_first = NULL;
6048 	mbuf_t *pkt_next = NULL;
6049 	u_int32_t poll_thresh = 0, poll_ival = 0;
6050 	int iorefcnt = 0;
6051 
6052 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6053 
6054 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
6055 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
6056 		poll_thresh = cnt;
6057 	}
6058 
6059 	while (m != NULL) {
6060 		struct if_proto *ifproto = NULL;
6061 		uint32_t pktf_mask;     /* pkt flags to preserve */
6062 
6063 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
6064 
6065 		if (ifp_param == NULL) {
6066 			ifp = m->m_pkthdr.rcvif;
6067 		}
6068 
6069 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
6070 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
6071 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
6072 			ifnet_poll(ifp);
6073 		}
6074 
6075 		/* Check if this mbuf looks valid */
6076 		MBUF_INPUT_CHECK(m, ifp);
6077 
6078 		next_packet = m->m_nextpkt;
6079 		m->m_nextpkt = NULL;
6080 		frame_header = m->m_pkthdr.pkt_hdr;
6081 		m->m_pkthdr.pkt_hdr = NULL;
6082 
6083 		/*
6084 		 * Get an IO reference count if the interface is not
6085 		 * loopback (lo0) and it is attached; lo0 never goes
6086 		 * away, so optimize for that.
6087 		 */
6088 		if (ifp != lo_ifp) {
6089 			/* iorefcnt is 0 if it hasn't been taken yet */
6090 			if (iorefcnt == 0) {
6091 				if (!ifnet_datamov_begin(ifp)) {
6092 					m_freem(m);
6093 					goto next;
6094 				}
6095 			}
6096 			iorefcnt = 1;
6097 			/*
6098 			 * Preserve the time stamp and skip pktap flags.
6099 			 */
6100 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
6101 		} else {
6102 			/*
6103 			 * If this arrived on lo0, preserve interface addr
6104 			 * info to allow for connectivity between loopback
6105 			 * and local interface addresses.
6106 			 */
6107 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
6108 		}
6109 		pktf_mask |= PKTF_WAKE_PKT;
6110 
6111 		/* make sure packet comes in clean */
6112 		m_classifier_init(m, pktf_mask);
6113 
6114 		ifp_inc_traffic_class_in(ifp, m);
6115 
6116 		/* find which protocol family this packet is for */
6117 		ifnet_lock_shared(ifp);
6118 		error = (*ifp->if_demux)(ifp, m, frame_header,
6119 		    &protocol_family);
6120 		ifnet_lock_done(ifp);
6121 		if (error != 0) {
6122 			if (error == EJUSTRETURN) {
6123 				goto next;
6124 			}
6125 			protocol_family = 0;
6126 		}
6127 
6128 #if (DEVELOPMENT || DEBUG)
6129 		/*
6130 		 * For testing we do not care about broadcast and multicast packets as
6131 		 * they are not as controllable as unicast traffic
6132 		 */
6133 		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
6134 			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
6135 			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
6136 				/*
6137 				 * This is a one-shot command
6138 				 */
6139 				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
6140 				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
6141 			}
6142 		}
6143 #endif /* (DEVELOPMENT || DEBUG) */
6144 		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
6145 			char buffer[64];
6146 			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
6147 
6148 			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
6149 			    ifp->if_xname, m_pktlen(m));
6150 			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
6151 				log_hexdump(buffer, buflen);
6152 			}
6153 		}
6154 
6155 		pktap_input(ifp, protocol_family, m, frame_header);
6156 
6157 		/* Drop v4 packets received on CLAT46 enabled cell interface */
6158 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6159 		    ifp->if_type == IFT_CELLULAR) {
6160 			m_freem(m);
6161 			ip6stat.ip6s_clat464_in_v4_drop++;
6162 			goto next;
6163 		}
6164 
6165 		/* Translate the packet if it is received on CLAT interface */
6166 		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
6167 		    && dlil_is_clat_needed(protocol_family, m)) {
6168 			char *data = NULL;
6169 			struct ether_header eh;
6170 			struct ether_header *ehp = NULL;
6171 
6172 			if (ifp->if_type == IFT_ETHER) {
6173 				ehp = (struct ether_header *)(void *)frame_header;
6174 				/* Skip RX Ethernet packets if they are not IPV6 */
6175 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
6176 					goto skip_clat;
6177 				}
6178 
6179 				/* Keep a copy of frame_header for Ethernet packets */
6180 				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
6181 			}
6182 			error = dlil_clat64(ifp, &protocol_family, &m);
6183 			data = (char *) mbuf_data(m);
6184 			if (error != 0) {
6185 				m_freem(m);
6186 				ip6stat.ip6s_clat464_in_drop++;
6187 				goto next;
6188 			}
6189 			/* Native v6 should be No-op */
6190 			if (protocol_family != PF_INET) {
6191 				goto skip_clat;
6192 			}
6193 
6194 			/* Do this only for translated v4 packets. */
6195 			switch (ifp->if_type) {
6196 			case IFT_CELLULAR:
6197 				frame_header = data;
6198 				break;
6199 			case IFT_ETHER:
6200 				/*
6201 				 * Drop if the mbuf doesn't have enough
6202 				 * space for Ethernet header
6203 				 */
6204 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
6205 					m_free(m);
6206 					ip6stat.ip6s_clat464_in_drop++;
6207 					goto next;
6208 				}
6209 				/*
6210 				 * Set the frame_header ETHER_HDR_LEN bytes
6211 				 * preceeding the data pointer. Change
6212 				 * the ether_type too.
6213 				 */
6214 				frame_header = data - ETHER_HDR_LEN;
6215 				eh.ether_type = htons(ETHERTYPE_IP);
6216 				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
6217 				break;
6218 			}
6219 		}
6220 skip_clat:
6221 		/*
6222 		 * Match the wake packet against the list of ports that has been
6223 		 * been queried by the driver before the device went to sleep
6224 		 */
6225 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
6226 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
6227 				if_ports_used_match_mbuf(ifp, protocol_family, m);
6228 			}
6229 		}
6230 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
6231 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
6232 			dlil_input_cksum_dbg(ifp, m, frame_header,
6233 			    protocol_family);
6234 		}
6235 		/*
6236 		 * For partial checksum offload, we expect the driver to
6237 		 * set the start offset indicating the start of the span
6238 		 * that is covered by the hardware-computed checksum;
6239 		 * adjust this start offset accordingly because the data
6240 		 * pointer has been advanced beyond the link-layer header.
6241 		 *
6242 		 * Virtual lan types (bridge, vlan, bond) can call
6243 		 * dlil_input_packet_list() with the same packet with the
6244 		 * checksum flags set. Set a flag indicating that the
6245 		 * adjustment has already been done.
6246 		 */
6247 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
6248 			/* adjustment has already been done */
6249 		} else if ((m->m_pkthdr.csum_flags &
6250 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6251 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6252 			int adj;
6253 			if (frame_header == NULL ||
6254 			    frame_header < (char *)mbuf_datastart(m) ||
6255 			    frame_header > (char *)m->m_data ||
6256 			    (adj = (int)(m->m_data - frame_header)) >
6257 			    m->m_pkthdr.csum_rx_start) {
6258 				m->m_pkthdr.csum_data = 0;
6259 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
6260 				hwcksum_in_invalidated++;
6261 			} else {
6262 				m->m_pkthdr.csum_rx_start -= adj;
6263 			}
6264 			/* make sure we don't adjust more than once */
6265 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
6266 		}
6267 		if (clat_debug) {
6268 			pktap_input(ifp, protocol_family, m, frame_header);
6269 		}
6270 
6271 		if (m->m_flags & (M_BCAST | M_MCAST)) {
6272 			atomic_add_64(&ifp->if_imcasts, 1);
6273 		}
6274 
6275 		/* run interface filters */
6276 		error = dlil_interface_filters_input(ifp, &m,
6277 		    &frame_header, protocol_family);
6278 		if (error != 0) {
6279 			if (error != EJUSTRETURN) {
6280 				m_freem(m);
6281 			}
6282 			goto next;
6283 		}
6284 		/*
6285 		 * A VLAN interface receives VLAN-tagged packets by attaching
6286 		 * its PF_VLAN protocol to a parent interface. When a VLAN
6287 		 * interface is a member of a bridge, the parent interface
6288 		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
6289 		 * M_PROMISC packet must be processed by the VLAN protocol
6290 		 * so that it can be sent up the stack via
6291 		 * dlil_input_packet_list(). That allows the bridge interface's
6292 		 * input filter, attached to the VLAN interface, to process
6293 		 * the packet.
6294 		 */
6295 		if (protocol_family != PF_VLAN &&
6296 		    (m->m_flags & M_PROMISC) != 0) {
6297 			m_freem(m);
6298 			goto next;
6299 		}
6300 
6301 		/* Lookup the protocol attachment to this interface */
6302 		if (protocol_family == 0) {
6303 			ifproto = NULL;
6304 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
6305 		    (last_ifproto->protocol_family == protocol_family)) {
6306 			VERIFY(ifproto == NULL);
6307 			ifproto = last_ifproto;
6308 			if_proto_ref(last_ifproto);
6309 		} else {
6310 			VERIFY(ifproto == NULL);
6311 			ifnet_lock_shared(ifp);
6312 			/* callee holds a proto refcnt upon success */
6313 			ifproto = find_attached_proto(ifp, protocol_family);
6314 			ifnet_lock_done(ifp);
6315 		}
6316 		if (ifproto == NULL) {
6317 			/* no protocol for this packet, discard */
6318 			m_freem(m);
6319 			goto next;
6320 		}
6321 		if (ifproto != last_ifproto) {
6322 			if (last_ifproto != NULL) {
6323 				/* pass up the list for the previous protocol */
6324 				dlil_ifproto_input(last_ifproto, pkt_first);
6325 				pkt_first = NULL;
6326 				if_proto_free(last_ifproto);
6327 			}
6328 			last_ifproto = ifproto;
6329 			if_proto_ref(ifproto);
6330 		}
6331 		/* extend the list */
6332 		m->m_pkthdr.pkt_hdr = frame_header;
6333 		if (pkt_first == NULL) {
6334 			pkt_first = m;
6335 		} else {
6336 			*pkt_next = m;
6337 		}
6338 		pkt_next = &m->m_nextpkt;
6339 
6340 next:
6341 		if (next_packet == NULL && last_ifproto != NULL) {
6342 			/* pass up the last list of packets */
6343 			dlil_ifproto_input(last_ifproto, pkt_first);
6344 			if_proto_free(last_ifproto);
6345 			last_ifproto = NULL;
6346 		}
6347 		if (ifproto != NULL) {
6348 			if_proto_free(ifproto);
6349 			ifproto = NULL;
6350 		}
6351 
6352 		m = next_packet;
6353 
6354 		/* update the driver's multicast filter, if needed */
6355 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6356 			ifp->if_updatemcasts = 0;
6357 		}
6358 		if (iorefcnt == 1) {
6359 			/* If the next mbuf is on a different interface, unlock data-mov */
6360 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
6361 				ifnet_datamov_end(ifp);
6362 				iorefcnt = 0;
6363 			}
6364 		}
6365 	}
6366 
6367 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6368 }
6369 
6370 errno_t
if_mcasts_update(struct ifnet * ifp)6371 if_mcasts_update(struct ifnet *ifp)
6372 {
6373 	errno_t err;
6374 
6375 	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6376 	if (err == EAFNOSUPPORT) {
6377 		err = 0;
6378 	}
6379 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6380 	    "(err=%d)\n", if_name(ifp),
6381 	    (err == 0 ? "successfully restored" : "failed to restore"),
6382 	    ifp->if_updatemcasts, err);
6383 
6384 	/* just return success */
6385 	return 0;
6386 }
6387 
6388 /* If ifp is set, we will increment the generation for the interface */
6389 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6390 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6391 {
6392 	if (ifp != NULL) {
6393 		ifnet_increment_generation(ifp);
6394 	}
6395 
6396 #if NECP
6397 	necp_update_all_clients();
6398 #endif /* NECP */
6399 
6400 	return kev_post_msg(event);
6401 }
6402 
6403 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6404 dlil_post_sifflags_msg(struct ifnet * ifp)
6405 {
6406 	struct kev_msg ev_msg;
6407 	struct net_event_data ev_data;
6408 
6409 	bzero(&ev_data, sizeof(ev_data));
6410 	bzero(&ev_msg, sizeof(ev_msg));
6411 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6412 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6413 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6414 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6415 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6416 	ev_data.if_family = ifp->if_family;
6417 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6418 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6419 	ev_msg.dv[0].data_ptr = &ev_data;
6420 	ev_msg.dv[1].data_length = 0;
6421 	dlil_post_complete_msg(ifp, &ev_msg);
6422 }
6423 
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver a kernel event to the interface filters, every attached
 * protocol's event callback, and the interface itself, then post it
 * via dlil_post_complete_msg().  When update_generation is true the
 * interface generation count is bumped as part of posting.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	/* small attachment counts avoid a heap allocation entirely */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/*
			 * Drop the mutex around the callback; the busy
			 * marker set above keeps the list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	/* first call sizes the snapshot; NULL list means "count only" */
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/*
		 * Take a refcnt on each attached protocol so that the
		 * callbacks below may safely run after the ifnet lock
		 * is released.
		 */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event callback (KPI v1 or v2 layout) */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6524 
6525 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6526 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6527 {
6528 	struct kev_msg kev_msg;
6529 	int result = 0;
6530 
6531 	if (ifp == NULL || event == NULL) {
6532 		return EINVAL;
6533 	}
6534 
6535 	bzero(&kev_msg, sizeof(kev_msg));
6536 	kev_msg.vendor_code = event->vendor_code;
6537 	kev_msg.kev_class = event->kev_class;
6538 	kev_msg.kev_subclass = event->kev_subclass;
6539 	kev_msg.event_code = event->event_code;
6540 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6541 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6542 	kev_msg.dv[1].data_length = 0;
6543 
6544 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6545 
6546 	return result;
6547 }
6548 
6549 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6550 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6551 {
6552 	mbuf_t  n = m;
6553 	int chainlen = 0;
6554 
6555 	while (n != NULL) {
6556 		chainlen++;
6557 		n = n->m_next;
6558 	}
6559 	switch (chainlen) {
6560 	case 0:
6561 		break;
6562 	case 1:
6563 		atomic_add_64(&cls->cls_one, 1);
6564 		break;
6565 	case 2:
6566 		atomic_add_64(&cls->cls_two, 1);
6567 		break;
6568 	case 3:
6569 		atomic_add_64(&cls->cls_three, 1);
6570 		break;
6571 	case 4:
6572 		atomic_add_64(&cls->cls_four, 1);
6573 		break;
6574 	case 5:
6575 	default:
6576 		atomic_add_64(&cls->cls_five_or_more, 1);
6577 		break;
6578 	}
6579 }
6580 
#if CONFIG_DTRACE
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	/* fire the DTrace ip:::send probe for IPv4 or IPv6 payloads */
	switch (proto_family) {
	case PF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
		break;
	}
	case PF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
		break;
	}
	default:
		break;
	}
}
#endif /* CONFIG_DTRACE */
6599 
6600 /*
6601  * dlil_output
6602  *
6603  * Caller should have a lock on the protocol domain if the protocol
6604  * doesn't support finer grained locking. In most cases, the lock
6605  * will be held from the socket layer and won't be released until
6606  * we return back to the socket layer.
6607  *
6608  * This does mean that we must take a protocol lock before we take
6609  * an interface lock if we're going to take both. This makes sense
6610  * because a protocol is likely to interact with an ifp while it
6611  * is under the protocol lock.
6612  *
6613  * An advisory code will be returned if adv is not null. This
6614  * can be used to provide feedback about interface queues to the
6615  * application.
6616  */
6617 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6618 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6619     void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6620 {
6621 	char *frame_type = NULL;
6622 	char *dst_linkaddr = NULL;
6623 	int retval = 0;
6624 	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6625 	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6626 	struct if_proto *proto = NULL;
6627 	mbuf_t  m = NULL;
6628 	mbuf_t  send_head = NULL;
6629 	mbuf_t  *send_tail = &send_head;
6630 	int iorefcnt = 0;
6631 	u_int32_t pre = 0, post = 0;
6632 	u_int32_t fpkts = 0, fbytes = 0;
6633 	int32_t flen = 0;
6634 	struct timespec now;
6635 	u_int64_t now_nsec;
6636 	boolean_t did_clat46 = FALSE;
6637 	protocol_family_t old_proto_family = proto_family;
6638 	struct sockaddr_in6 dest6;
6639 	struct rtentry *rt = NULL;
6640 	u_int32_t m_loop_set = 0;
6641 
6642 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6643 
6644 	/*
6645 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6646 	 * from happening while this operation is in progress
6647 	 */
6648 	if (!ifnet_datamov_begin(ifp)) {
6649 		retval = ENXIO;
6650 		goto cleanup;
6651 	}
6652 	iorefcnt = 1;
6653 
6654 	VERIFY(ifp->if_output_dlil != NULL);
6655 
6656 	/* update the driver's multicast filter, if needed */
6657 	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6658 		ifp->if_updatemcasts = 0;
6659 	}
6660 
6661 	frame_type = frame_type_buffer;
6662 	dst_linkaddr = dst_linkaddr_buffer;
6663 
6664 	if (raw == 0) {
6665 		ifnet_lock_shared(ifp);
6666 		/* callee holds a proto refcnt upon success */
6667 		proto = find_attached_proto(ifp, proto_family);
6668 		if (proto == NULL) {
6669 			ifnet_lock_done(ifp);
6670 			retval = ENXIO;
6671 			goto cleanup;
6672 		}
6673 		ifnet_lock_done(ifp);
6674 	}
6675 
6676 preout_again:
6677 	if (packetlist == NULL) {
6678 		goto cleanup;
6679 	}
6680 
6681 	m = packetlist;
6682 	packetlist = packetlist->m_nextpkt;
6683 	m->m_nextpkt = NULL;
6684 
6685 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6686 
6687 	/*
6688 	 * Perform address family translation for the first
6689 	 * packet outside the loop in order to perform address
6690 	 * lookup for the translated proto family.
6691 	 */
6692 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6693 	    (ifp->if_type == IFT_CELLULAR ||
6694 	    dlil_is_clat_needed(proto_family, m))) {
6695 		retval = dlil_clat46(ifp, &proto_family, &m);
6696 		/*
6697 		 * Go to the next packet if translation fails
6698 		 */
6699 		if (retval != 0) {
6700 			m_freem(m);
6701 			m = NULL;
6702 			ip6stat.ip6s_clat464_out_drop++;
6703 			/* Make sure that the proto family is PF_INET */
6704 			ASSERT(proto_family == PF_INET);
6705 			goto preout_again;
6706 		}
6707 		/*
6708 		 * Free the old one and make it point to the IPv6 proto structure.
6709 		 *
6710 		 * Change proto for the first time we have successfully
6711 		 * performed address family translation.
6712 		 */
6713 		if (!did_clat46 && proto_family == PF_INET6) {
6714 			did_clat46 = TRUE;
6715 
6716 			if (proto != NULL) {
6717 				if_proto_free(proto);
6718 			}
6719 			ifnet_lock_shared(ifp);
6720 			/* callee holds a proto refcnt upon success */
6721 			proto = find_attached_proto(ifp, proto_family);
6722 			if (proto == NULL) {
6723 				ifnet_lock_done(ifp);
6724 				retval = ENXIO;
6725 				m_freem(m);
6726 				m = NULL;
6727 				goto cleanup;
6728 			}
6729 			ifnet_lock_done(ifp);
6730 			if (ifp->if_type == IFT_ETHER) {
6731 				/* Update the dest to translated v6 address */
6732 				dest6.sin6_len = sizeof(struct sockaddr_in6);
6733 				dest6.sin6_family = AF_INET6;
6734 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6735 				dest = (const struct sockaddr *)&dest6;
6736 
6737 				/*
6738 				 * Lookup route to the translated destination
6739 				 * Free this route ref during cleanup
6740 				 */
6741 				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
6742 				    0, 0, ifp->if_index);
6743 
6744 				route = rt;
6745 			}
6746 		}
6747 	}
6748 
6749 	/*
6750 	 * This path gets packet chain going to the same destination.
6751 	 * The pre output routine is used to either trigger resolution of
6752 	 * the next hop or retreive the next hop's link layer addressing.
6753 	 * For ex: ether_inet(6)_pre_output routine.
6754 	 *
6755 	 * If the routine returns EJUSTRETURN, it implies that packet has
6756 	 * been queued, and therefore we have to call preout_again for the
6757 	 * following packet in the chain.
6758 	 *
6759 	 * For errors other than EJUSTRETURN, the current packet is freed
6760 	 * and the rest of the chain (pointed by packetlist is freed as
6761 	 * part of clean up.
6762 	 *
6763 	 * Else if there is no error the retrieved information is used for
6764 	 * all the packets in the chain.
6765 	 */
6766 	if (raw == 0) {
6767 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6768 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6769 		retval = 0;
6770 		if (preoutp != NULL) {
6771 			retval = preoutp(ifp, proto_family, &m, dest, route,
6772 			    frame_type, dst_linkaddr);
6773 
6774 			if (retval != 0) {
6775 				if (retval == EJUSTRETURN) {
6776 					goto preout_again;
6777 				}
6778 				m_freem(m);
6779 				m = NULL;
6780 				goto cleanup;
6781 			}
6782 		}
6783 	}
6784 
6785 	do {
6786 		/*
6787 		 * pkt_hdr is set here to point to m_data prior to
6788 		 * calling into the framer. This value of pkt_hdr is
6789 		 * used by the netif gso logic to retrieve the ip header
6790 		 * for the TCP packets, offloaded for TSO processing.
6791 		 */
6792 		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6793 			uint8_t vlan_encap_len = 0;
6794 
6795 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6796 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6797 			}
6798 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6799 		} else {
6800 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
6801 		}
6802 
6803 		/*
6804 		 * Perform address family translation if needed.
6805 		 * For now we only support stateless 4 to 6 translation
6806 		 * on the out path.
6807 		 *
6808 		 * The routine below translates IP header, updates protocol
6809 		 * checksum and also translates ICMP.
6810 		 *
6811 		 * We skip the first packet as it is already translated and
6812 		 * the proto family is set to PF_INET6.
6813 		 */
6814 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6815 		    (ifp->if_type == IFT_CELLULAR ||
6816 		    dlil_is_clat_needed(proto_family, m))) {
6817 			retval = dlil_clat46(ifp, &proto_family, &m);
6818 			/* Goto the next packet if the translation fails */
6819 			if (retval != 0) {
6820 				m_freem(m);
6821 				m = NULL;
6822 				ip6stat.ip6s_clat464_out_drop++;
6823 				goto next;
6824 			}
6825 		}
6826 
6827 #if CONFIG_DTRACE
6828 		if (!raw) {
6829 			dlil_output_dtrace(ifp, proto_family, m);
6830 		}
6831 #endif /* CONFIG_DTRACE */
6832 
6833 		if (raw == 0 && ifp->if_framer != NULL) {
6834 			int rcvif_set = 0;
6835 
6836 			/*
6837 			 * If this is a broadcast packet that needs to be
6838 			 * looped back into the system, set the inbound ifp
6839 			 * to that of the outbound ifp.  This will allow
6840 			 * us to determine that it is a legitimate packet
6841 			 * for the system.  Only set the ifp if it's not
6842 			 * already set, just to be safe.
6843 			 */
6844 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6845 			    m->m_pkthdr.rcvif == NULL) {
6846 				m->m_pkthdr.rcvif = ifp;
6847 				rcvif_set = 1;
6848 			}
6849 			m_loop_set = m->m_flags & M_LOOP;
6850 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6851 			    frame_type, &pre, &post);
6852 			if (retval != 0) {
6853 				if (retval != EJUSTRETURN) {
6854 					m_freem(m);
6855 				}
6856 				goto next;
6857 			}
6858 
6859 			/*
6860 			 * For partial checksum offload, adjust the start
6861 			 * and stuff offsets based on the prepended header.
6862 			 */
6863 			if ((m->m_pkthdr.csum_flags &
6864 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6865 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6866 				m->m_pkthdr.csum_tx_stuff += pre;
6867 				m->m_pkthdr.csum_tx_start += pre;
6868 			}
6869 
6870 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6871 				dlil_output_cksum_dbg(ifp, m, pre,
6872 				    proto_family);
6873 			}
6874 
6875 			/*
6876 			 * Clear the ifp if it was set above, and to be
6877 			 * safe, only if it is still the same as the
6878 			 * outbound ifp we have in context.  If it was
6879 			 * looped back, then a copy of it was sent to the
6880 			 * loopback interface with the rcvif set, and we
6881 			 * are clearing the one that will go down to the
6882 			 * layer below.
6883 			 */
6884 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6885 				m->m_pkthdr.rcvif = NULL;
6886 			}
6887 		}
6888 
6889 		/*
6890 		 * Let interface filters (if any) do their thing ...
6891 		 */
6892 		retval = dlil_interface_filters_output(ifp, &m, proto_family);
6893 		if (retval != 0) {
6894 			if (retval != EJUSTRETURN) {
6895 				m_freem(m);
6896 			}
6897 			goto next;
6898 		}
6899 		/*
6900 		 * Strip away M_PROTO1 bit prior to sending packet
6901 		 * to the driver as this field may be used by the driver
6902 		 */
6903 		m->m_flags &= ~M_PROTO1;
6904 
6905 		/*
6906 		 * If the underlying interface is not capable of handling a
6907 		 * packet whose data portion spans across physically disjoint
6908 		 * pages, we need to "normalize" the packet so that we pass
6909 		 * down a chain of mbufs where each mbuf points to a span that
6910 		 * resides in the system page boundary.  If the packet does
6911 		 * not cross page(s), the following is a no-op.
6912 		 */
6913 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6914 			if ((m = m_normalize(m)) == NULL) {
6915 				goto next;
6916 			}
6917 		}
6918 
6919 		/*
6920 		 * If this is a TSO packet, make sure the interface still
6921 		 * advertise TSO capability.
6922 		 */
6923 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6924 			retval = EMSGSIZE;
6925 			m_freem(m);
6926 			goto cleanup;
6927 		}
6928 
6929 		ifp_inc_traffic_class_out(ifp, m);
6930 
6931 #if SKYWALK
6932 		/*
6933 		 * For native skywalk devices, packets will be passed to pktap
6934 		 * after GSO or after the mbuf to packet conversion.
6935 		 * This is done for IPv4/IPv6 packets only because there is no
6936 		 * space in the mbuf to pass down the proto family.
6937 		 */
6938 		if (dlil_is_native_netif_nexus(ifp)) {
6939 			if (raw || m->m_pkthdr.pkt_proto == 0) {
6940 				pktap_output(ifp, proto_family, m, pre, post);
6941 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6942 			}
6943 		} else {
6944 			pktap_output(ifp, proto_family, m, pre, post);
6945 		}
6946 #else /* SKYWALK */
6947 		pktap_output(ifp, proto_family, m, pre, post);
6948 #endif /* SKYWALK */
6949 
6950 		/*
6951 		 * Count the number of elements in the mbuf chain
6952 		 */
6953 		if (tx_chain_len_count) {
6954 			dlil_count_chain_len(m, &tx_chain_len_stats);
6955 		}
6956 
6957 		/*
6958 		 * Record timestamp; ifnet_enqueue() will use this info
6959 		 * rather than redoing the work.  An optimization could
6960 		 * involve doing this just once at the top, if there are
6961 		 * no interface filters attached, but that's probably
6962 		 * not a big deal.
6963 		 */
6964 		nanouptime(&now);
6965 		net_timernsec(&now, &now_nsec);
6966 		(void) mbuf_set_timestamp(m, now_nsec, TRUE);
6967 
6968 		/*
6969 		 * Discard partial sum information if this packet originated
6970 		 * from another interface; the packet would already have the
6971 		 * final checksum and we shouldn't recompute it.
6972 		 */
6973 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6974 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6975 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6976 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6977 			m->m_pkthdr.csum_data = 0;
6978 		}
6979 
6980 		/*
6981 		 * Finally, call the driver.
6982 		 */
6983 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6984 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6985 				flen += (m_pktlen(m) - (pre + post));
6986 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6987 			}
6988 			*send_tail = m;
6989 			send_tail = &m->m_nextpkt;
6990 		} else {
6991 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6992 				flen = (m_pktlen(m) - (pre + post));
6993 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6994 			} else {
6995 				flen = 0;
6996 			}
6997 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6998 			    0, 0, 0, 0, 0);
6999 			retval = (*ifp->if_output_dlil)(ifp, m);
7000 			if (retval == EQFULL || retval == EQSUSPENDED) {
7001 				if (adv != NULL && adv->code == FADV_SUCCESS) {
7002 					adv->code = (retval == EQFULL ?
7003 					    FADV_FLOW_CONTROLLED :
7004 					    FADV_SUSPENDED);
7005 				}
7006 				retval = 0;
7007 			}
7008 			if (retval == 0 && flen > 0) {
7009 				fbytes += flen;
7010 				fpkts++;
7011 			}
7012 			if (retval != 0 && dlil_verbose) {
7013 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7014 				    __func__, if_name(ifp),
7015 				    retval);
7016 			}
7017 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7018 			    0, 0, 0, 0, 0);
7019 		}
7020 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7021 
7022 next:
7023 		m = packetlist;
7024 		if (m != NULL) {
7025 			m->m_flags |= m_loop_set;
7026 			packetlist = packetlist->m_nextpkt;
7027 			m->m_nextpkt = NULL;
7028 		}
7029 		/* Reset the proto family to old proto family for CLAT */
7030 		if (did_clat46) {
7031 			proto_family = old_proto_family;
7032 		}
7033 	} while (m != NULL);
7034 
7035 	if (send_head != NULL) {
7036 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7037 		    0, 0, 0, 0, 0);
7038 		if (ifp->if_eflags & IFEF_SENDLIST) {
7039 			retval = (*ifp->if_output_dlil)(ifp, send_head);
7040 			if (retval == EQFULL || retval == EQSUSPENDED) {
7041 				if (adv != NULL) {
7042 					adv->code = (retval == EQFULL ?
7043 					    FADV_FLOW_CONTROLLED :
7044 					    FADV_SUSPENDED);
7045 				}
7046 				retval = 0;
7047 			}
7048 			if (retval == 0 && flen > 0) {
7049 				fbytes += flen;
7050 				fpkts++;
7051 			}
7052 			if (retval != 0 && dlil_verbose) {
7053 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7054 				    __func__, if_name(ifp), retval);
7055 			}
7056 		} else {
7057 			struct mbuf *send_m;
7058 			int enq_cnt = 0;
7059 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7060 			while (send_head != NULL) {
7061 				send_m = send_head;
7062 				send_head = send_m->m_nextpkt;
7063 				send_m->m_nextpkt = NULL;
7064 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7065 				if (retval == EQFULL || retval == EQSUSPENDED) {
7066 					if (adv != NULL) {
7067 						adv->code = (retval == EQFULL ?
7068 						    FADV_FLOW_CONTROLLED :
7069 						    FADV_SUSPENDED);
7070 					}
7071 					retval = 0;
7072 				}
7073 				if (retval == 0) {
7074 					enq_cnt++;
7075 					if (flen > 0) {
7076 						fpkts++;
7077 					}
7078 				}
7079 				if (retval != 0 && dlil_verbose) {
7080 					DLIL_PRINTF("%s: output error on %s "
7081 					    "retval = %d\n",
7082 					    __func__, if_name(ifp), retval);
7083 				}
7084 			}
7085 			if (enq_cnt > 0) {
7086 				fbytes += flen;
7087 				ifnet_start(ifp);
7088 			}
7089 		}
7090 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7091 	}
7092 
7093 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7094 
7095 cleanup:
7096 	if (fbytes > 0) {
7097 		ifp->if_fbytes += fbytes;
7098 	}
7099 	if (fpkts > 0) {
7100 		ifp->if_fpackets += fpkts;
7101 	}
7102 	if (proto != NULL) {
7103 		if_proto_free(proto);
7104 	}
7105 	if (packetlist) { /* if any packets are left, clean up */
7106 		mbuf_freem_list(packetlist);
7107 	}
7108 	if (retval == EJUSTRETURN) {
7109 		retval = 0;
7110 	}
7111 	if (iorefcnt == 1) {
7112 		ifnet_datamov_end(ifp);
7113 	}
7114 	if (rt != NULL) {
7115 		rtfree(rt);
7116 		rt = NULL;
7117 	}
7118 
7119 	return retval;
7120 }
7121 
7122 /*
7123  * This routine checks if the destination address is not a loopback, link-local,
7124  * multicast or broadcast address.
7125  */
7126 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7127 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7128 {
7129 	int ret = 0;
7130 	switch (proto_family) {
7131 	case PF_INET: {
7132 		struct ip *iph = mtod(m, struct ip *);
7133 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7134 			ret = 1;
7135 		}
7136 		break;
7137 	}
7138 	case PF_INET6: {
7139 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7140 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7141 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7142 			ret = 1;
7143 		}
7144 		break;
7145 	}
7146 	}
7147 
7148 	return ret;
7149 }
7150 /*
7151  * @brief This routine translates IPv4 packet to IPv6 packet,
7152  *     updates protocol checksum and also translates ICMP for code
7153  *     along with inner header translation.
7154  *
7155  * @param ifp Pointer to the interface
7156  * @param proto_family pointer to protocol family. It is updated if function
7157  *     performs the translation successfully.
7158  * @param m Pointer to the pointer pointing to the packet. Needed because this
7159  *     routine can end up changing the mbuf to a different one.
7160  *
7161  * @return 0 on success or else a negative value.
7162  */
7163 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7164 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7165 {
7166 	VERIFY(*proto_family == PF_INET);
7167 	VERIFY(IS_INTF_CLAT46(ifp));
7168 
7169 	pbuf_t pbuf_store, *pbuf = NULL;
7170 	struct ip *iph = NULL;
7171 	struct in_addr osrc, odst;
7172 	uint8_t proto = 0;
7173 	struct in6_ifaddr *ia6_clat_src = NULL;
7174 	struct in6_addr *src = NULL;
7175 	struct in6_addr dst;
7176 	int error = 0;
7177 	uint16_t off = 0;
7178 	uint16_t tot_len = 0;
7179 	uint16_t ip_id_val = 0;
7180 	uint16_t ip_frag_off = 0;
7181 
7182 	boolean_t is_frag = FALSE;
7183 	boolean_t is_first_frag = TRUE;
7184 	boolean_t is_last_frag = TRUE;
7185 
7186 	pbuf_init_mbuf(&pbuf_store, *m, ifp);
7187 	pbuf = &pbuf_store;
7188 	iph = pbuf->pb_data;
7189 
7190 	osrc = iph->ip_src;
7191 	odst = iph->ip_dst;
7192 	proto = iph->ip_p;
7193 	off = (uint16_t)(iph->ip_hl << 2);
7194 	ip_id_val = iph->ip_id;
7195 	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7196 
7197 	tot_len = ntohs(iph->ip_len);
7198 
7199 	/*
7200 	 * For packets that are not first frags
7201 	 * we only need to adjust CSUM.
7202 	 * For 4 to 6, Fragmentation header gets appended
7203 	 * after proto translation.
7204 	 */
7205 	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7206 		is_frag = TRUE;
7207 
7208 		/* If the offset is not zero, it is not first frag */
7209 		if (ip_frag_off != 0) {
7210 			is_first_frag = FALSE;
7211 		}
7212 
7213 		/* If IP_MF is set, then it is not last frag */
7214 		if (ntohs(iph->ip_off) & IP_MF) {
7215 			is_last_frag = FALSE;
7216 		}
7217 	}
7218 
7219 	/*
7220 	 * Retrive the local IPv6 CLAT46 address reserved for stateless
7221 	 * translation.
7222 	 */
7223 	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7224 	if (ia6_clat_src == NULL) {
7225 		ip6stat.ip6s_clat464_out_nov6addr_drop++;
7226 		error = -1;
7227 		goto cleanup;
7228 	}
7229 
7230 	src = &ia6_clat_src->ia_addr.sin6_addr;
7231 
7232 	/*
7233 	 * Translate IPv4 destination to IPv6 destination by using the
7234 	 * prefixes learned through prior PLAT discovery.
7235 	 */
7236 	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
7237 		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7238 		goto cleanup;
7239 	}
7240 
7241 	/* Translate the IP header part first */
7242 	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7243 	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
7244 
7245 	iph = NULL;     /* Invalidate iph as pbuf has been modified */
7246 
7247 	if (error != 0) {
7248 		ip6stat.ip6s_clat464_out_46transfail_drop++;
7249 		goto cleanup;
7250 	}
7251 
7252 	/*
7253 	 * Translate protocol header, update checksum, checksum flags
7254 	 * and related fields.
7255 	 */
7256 	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7257 	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7258 
7259 	if (error != 0) {
7260 		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7261 		goto cleanup;
7262 	}
7263 
7264 	/* Now insert the IPv6 fragment header */
7265 	if (is_frag) {
7266 		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7267 
7268 		if (error != 0) {
7269 			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7270 			goto cleanup;
7271 		}
7272 	}
7273 
7274 cleanup:
7275 	if (ia6_clat_src != NULL) {
7276 		IFA_REMREF(&ia6_clat_src->ia_ifa);
7277 	}
7278 
7279 	if (pbuf_is_valid(pbuf)) {
7280 		*m = pbuf->pb_mbuf;
7281 		pbuf->pb_mbuf = NULL;
7282 		pbuf_destroy(pbuf);
7283 	} else {
7284 		error = -1;
7285 		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7286 	}
7287 
7288 	if (error == 0) {
7289 		*proto_family = PF_INET6;
7290 		ip6stat.ip6s_clat464_out_success++;
7291 	}
7292 
7293 	return error;
7294 }
7295 
/*
 * @brief This routine translates an incoming IPv6 packet to IPv4,
 *     updates the protocol checksum, and also translates the ICMPv6
 *     outer and inner headers.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;	/* original IPv6 source/destination */
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;	/* reserved CLAT46 IPv6 addr */
	struct in_ifaddr *ia4_clat_dst = NULL;	/* reserved CLAT46 IPv4 addr */
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Save the original addresses before the header is rewritten */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for
		 * stateless translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		/* Done with the IPv6 address; drop its reference */
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class lives in bits 20-27 of the v6 flow word */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly translated) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7437 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Arguments for a deferred ifnet ioctl request */
struct ifnet_ioctl_event {
	struct ifnet *ifp;	/* target; enqueuer holds an io ref on it */
	u_long ioctl_code;	/* ioctl command to issue asynchronously */
};

/* Work-queue entry carrying an ifnet_ioctl_event payload */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;	/* embedded work-queue linkage */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7450 
7451 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7452 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7453 {
7454 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7455 
7456 	/*
7457 	 * Get an io ref count if the interface is attached.
7458 	 * At this point it most likely is. We are taking a reference for
7459 	 * deferred processing.
7460 	 */
7461 	if (!ifnet_is_attached(ifp, 1)) {
7462 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7463 		    "is not attached",
7464 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7465 		return;
7466 	}
7467 
7468 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7469 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7470 
7471 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7472 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7473 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7474 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7475 }
7476 
7477 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7478 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7479 {
7480 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7481 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7482 
7483 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7484 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7485 	int ret = 0;
7486 
7487 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7488 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7489 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7490 	} else if (dlil_verbose) {
7491 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7492 		    "for ioctl %lu",
7493 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7494 	}
7495 	ifnet_decr_iorefcnt(ifp);
7496 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7497 	return;
7498 }
7499 
/*
 * Dispatch an ioctl for `ifp': first to every matching interface
 * filter, then to the attached protocol (if `proto_fam' is non-zero),
 * and finally to the interface driver itself.  The first non-trivial
 * status (anything other than 0/EOPNOTSUPP) wins; EJUSTRETURN stops
 * processing and is reported as success.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* "nobody handled it yet" sentinel */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock across the callout; the busy marker
			 * set above keeps the filter list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means handled-and-stop; report it as success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7617 
7618 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7619 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7620 {
7621 	errno_t error = 0;
7622 
7623 
7624 	if (ifp->if_set_bpf_tap) {
7625 		/* Get an io reference on the interface if it is attached */
7626 		if (!ifnet_is_attached(ifp, 1)) {
7627 			return ENXIO;
7628 		}
7629 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7630 		ifnet_decr_iorefcnt(ifp);
7631 	}
7632 	return error;
7633 }
7634 
7635 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7636 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7637     struct sockaddr *ll_addr, size_t ll_len)
7638 {
7639 	errno_t result = EOPNOTSUPP;
7640 	struct if_proto *proto;
7641 	const struct sockaddr *verify;
7642 	proto_media_resolve_multi resolvep;
7643 
7644 	if (!ifnet_is_attached(ifp, 1)) {
7645 		return result;
7646 	}
7647 
7648 	bzero(ll_addr, ll_len);
7649 
7650 	/* Call the protocol first; callee holds a proto refcnt upon success */
7651 	ifnet_lock_shared(ifp);
7652 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7653 	ifnet_lock_done(ifp);
7654 	if (proto != NULL) {
7655 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7656 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7657 		if (resolvep != NULL) {
7658 			result = resolvep(ifp, proto_addr,
7659 			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7660 		}
7661 		if_proto_free(proto);
7662 	}
7663 
7664 	/* Let the interface verify the multicast address */
7665 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7666 		if (result == 0) {
7667 			verify = ll_addr;
7668 		} else {
7669 			verify = proto_addr;
7670 		}
7671 		result = ifp->if_check_multi(ifp, verify);
7672 	}
7673 
7674 	ifnet_decr_iorefcnt(ifp);
7675 	return result;
7676 }
7677 
7678 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7679 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7680     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7681     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7682 {
7683 	struct if_proto *proto;
7684 	errno_t result = 0;
7685 
7686 	if ((ifp->if_flags & IFF_NOARP) != 0) {
7687 		result = ENOTSUP;
7688 		goto done;
7689 	}
7690 
7691 	/* callee holds a proto refcnt upon success */
7692 	ifnet_lock_shared(ifp);
7693 	proto = find_attached_proto(ifp, target_proto->sa_family);
7694 	ifnet_lock_done(ifp);
7695 	if (proto == NULL) {
7696 		result = ENOTSUP;
7697 	} else {
7698 		proto_media_send_arp    arpp;
7699 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7700 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7701 		if (arpp == NULL) {
7702 			result = ENOTSUP;
7703 		} else {
7704 			switch (arpop) {
7705 			case ARPOP_REQUEST:
7706 				arpstat.txrequests++;
7707 				if (target_hw != NULL) {
7708 					arpstat.txurequests++;
7709 				}
7710 				break;
7711 			case ARPOP_REPLY:
7712 				arpstat.txreplies++;
7713 				break;
7714 			}
7715 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7716 			    target_hw, target_proto);
7717 		}
7718 		if_proto_free(proto);
7719 	}
7720 done:
7721 	return result;
7722 }
7723 
/*
 * Network thread marks: opaque tokens recording which mark bits a call
 * actually toggled on the current uthread.  A token is a pointer into
 * an array based at `net_thread_marks_base'; the byte offset from the
 * base encodes the toggled bits (see net_thread_marks_push()).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no bits were changed" (offset 0 from the base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7729 
/*
 * Set the requested mark bits on the current uthread and return a
 * token encoding the bits that were NOT already set, so the matching
 * net_thread_marks_pop() restores exactly the prior state.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits that are currently clear get toggled */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* Encode the toggled bits as a byte offset from the base object */
	return (net_thread_marks_t)&base[pop];
}
7747 
/*
 * Clear the requested mark bits on the current uthread and return a
 * token encoding the bits that were actually set, so the matching
 * net_thread_unmarks_pop() restores exactly the prior state.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits that are currently set get toggled */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* Encode the cleared bits as a byte offset from the base object */
	return (net_thread_marks_t)&base[unpop];
}
7765 
/*
 * Undo a net_thread_marks_push(): clear the bits recorded in `popx'.
 * The bits are recovered from the token's pointer offset off the base.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* Token must encode only 32-bit mark bits, all still set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7781 
/*
 * Undo a net_thread_unmarks_push(): re-set the bits recorded in
 * `unpopx', recovered from the token's pointer offset off the base.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* Token must encode only 32-bit mark bits, all still clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7797 
7798 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7799 net_thread_is_marked(u_int32_t check)
7800 {
7801 	if (check != 0) {
7802 		struct uthread *uth = current_uthread();
7803 		return uth->uu_network_marks & check;
7804 	} else {
7805 		return 0;
7806 	}
7807 }
7808 
7809 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7810 net_thread_is_unmarked(u_int32_t check)
7811 {
7812 	if (check != 0) {
7813 		struct uthread *uth = current_uthread();
7814 		return ~uth->uu_network_marks & check;
7815 	} else {
7816 		return 0;
7817 	}
7818 }
7819 
7820 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7821 _is_announcement(const struct sockaddr_in * sender_sin,
7822     const struct sockaddr_in * target_sin)
7823 {
7824 	if (target_sin == NULL || sender_sin == NULL) {
7825 		return FALSE;
7826 	}
7827 
7828 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7829 }
7830 
/*
 * Send an ARP packet on behalf of a protocol.  An IPv4 link-local
 * request that is not an announcement is replicated on every interface
 * marked IFEF_ARPLL; everything else goes out only on `ifp'.  When the
 * target is a router (RTF_ROUTER), that hint is forwarded to the
 * protocol's send_arp callback via SIN_ROUTER.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* Keep the lladdr alive across the unlock */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* Keep the first non-ENOTSUP status we see */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7945 
7946 /*
7947  * Caller must hold ifnet head lock.
7948  */
7949 static int
ifnet_lookup(struct ifnet * ifp)7950 ifnet_lookup(struct ifnet *ifp)
7951 {
7952 	struct ifnet *_ifp;
7953 
7954 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7955 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7956 		if (_ifp == ifp) {
7957 			break;
7958 		}
7959 	}
7960 	return _ifp != NULL;
7961 }
7962 
7963 /*
7964  * Caller has to pass a non-zero refio argument to get a
7965  * IO reference count. This will prevent ifnet_detach from
7966  * being called when there are outstanding io reference counts.
7967  */
7968 int
ifnet_is_attached(struct ifnet * ifp,int refio)7969 ifnet_is_attached(struct ifnet *ifp, int refio)
7970 {
7971 	int ret;
7972 
7973 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7974 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7975 		if (refio > 0) {
7976 			ifp->if_refio++;
7977 		}
7978 	}
7979 	lck_mtx_unlock(&ifp->if_ref_lock);
7980 
7981 	return ret;
7982 }
7983 
/* Account for a worker thread being spawned on behalf of this ifnet. */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7991 
/*
 * A pending worker thread has started (or aborted); when the count
 * drops to zero, wake anyone sleeping on if_threads_pending.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8003 
8004 /*
8005  * Caller must ensure the interface is attached; the assumption is that
8006  * there is at least an outstanding IO reference count held already.
8007  * Most callers would call ifnet_is_{attached,data_ready}() instead.
8008  */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* caller guarantees attachment and an existing io ref */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8018 
/* Drop one io reference; if_ref_lock must be held by the caller. */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* no data-mover may outlive the io refs that back it */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8039 
/* Locking wrapper around ifnet_decr_iorefcnt_locked(). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8047 
/*
 * Enter the data path: take an io ref and bump the data-mover count.
 * Fails (returns FALSE) unless the interface is fully attached and
 * ready, i.e. not suspended by ifnet_datamov_suspend().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8062 
/*
 * Leave the data path: drop the data-mover count and the io ref taken
 * by ifnet_datamov_begin(), waking any drainers when we were the last.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8080 
/*
 * Suspend data movement (if_ref_lock held).  Takes an io ref that the
 * matching ifnet_datamov_resume() releases; the first suspender clears
 * IFRF_READY so new data-path entries via ifnet_datamov_begin() fail.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8091 
/* Locking wrapper around ifnet_datamov_suspend_locked(). */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8100 
/*
 * Suspend data movement only if no suspension is already in effect.
 * Returns TRUE when this call performed the suspend (and the caller
 * therefore owns a matching ifnet_datamov_resume()), FALSE otherwise.
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
8114 
/*
 * Block until every in-flight data-mover has left the data path, then
 * flush the transmit queues.  The interface must already be suspended
 * (ifnet_datamov_suspend*), so no new movers can enter meanwhile.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		/* woken by the last ifnet_datamov_end() */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8142 
/* Convenience: suspend data movement, then wait for it to quiesce. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8149 
/*
 * Undo one suspension; the last resumer restores IFRF_READY so the
 * data path reopens.  Also drops the io ref taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8163 
8164 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8165 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8166 {
8167 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8168 	ctrace_t *tr;
8169 	u_int32_t idx;
8170 	u_int16_t *cnt;
8171 
8172 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8173 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8174 		/* NOTREACHED */
8175 	}
8176 
8177 	if (refhold) {
8178 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8179 		tr = dl_if_dbg->dldbg_if_refhold;
8180 	} else {
8181 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8182 		tr = dl_if_dbg->dldbg_if_refrele;
8183 	}
8184 
8185 	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
8186 	ctrace_record(&tr[idx]);
8187 }
8188 
8189 errno_t
dlil_if_ref(struct ifnet * ifp)8190 dlil_if_ref(struct ifnet *ifp)
8191 {
8192 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8193 
8194 	if (dl_if == NULL) {
8195 		return EINVAL;
8196 	}
8197 
8198 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8199 	++dl_if->dl_if_refcnt;
8200 	if (dl_if->dl_if_refcnt == 0) {
8201 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8202 		/* NOTREACHED */
8203 	}
8204 	if (dl_if->dl_if_trace != NULL) {
8205 		(*dl_if->dl_if_trace)(dl_if, TRUE);
8206 	}
8207 	lck_mtx_unlock(&dl_if->dl_if_lock);
8208 
8209 	return 0;
8210 }
8211 
8212 errno_t
dlil_if_free(struct ifnet * ifp)8213 dlil_if_free(struct ifnet *ifp)
8214 {
8215 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8216 	bool need_release = FALSE;
8217 
8218 	if (dl_if == NULL) {
8219 		return EINVAL;
8220 	}
8221 
8222 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8223 	switch (dl_if->dl_if_refcnt) {
8224 	case 0:
8225 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8226 		/* NOTREACHED */
8227 		break;
8228 	case 1:
8229 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8230 			need_release = TRUE;
8231 		}
8232 		break;
8233 	default:
8234 		break;
8235 	}
8236 	--dl_if->dl_if_refcnt;
8237 	if (dl_if->dl_if_trace != NULL) {
8238 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8239 	}
8240 	lck_mtx_unlock(&dl_if->dl_if_lock);
8241 	if (need_release) {
8242 		_dlil_if_release(ifp, true);
8243 	}
8244 	return 0;
8245 }
8246 
/*
 * Attach `proto' to its interface: let the family module refine the
 * demux descriptors, append the proto to the interface's protocol
 * hash chain, and broadcast KEV_DL_PROTO_ATTACHED.  On success the
 * hash holds its own proto reference.  Returns 0 or an errno.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* already attached: drop the lookup ref and fail */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* drop the io ref taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8326 
8327 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8328 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8329 {
8330 	/*
8331 	 * A protocol has been attached, mark the interface up.
8332 	 * This used to be done by configd.KernelEventMonitor, but that
8333 	 * is inherently prone to races (rdar://problem/30810208).
8334 	 */
8335 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8336 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8337 	dlil_post_sifflags_msg(ifp);
8338 #if SKYWALK
8339 	switch (protocol) {
8340 	case AF_INET:
8341 	case AF_INET6:
8342 		/* don't attach the flowswitch unless attaching IP */
8343 		dlil_attach_flowswitch_nexus(ifp);
8344 		break;
8345 	default:
8346 		break;
8347 	}
8348 #endif /* SKYWALK */
8349 }
8350 
/*
 * Attach a v1 KPI protocol handler to an interface.
 *
 * Allocates an if_proto, copies the v1 callbacks from proto_details,
 * and hands it to dlil_attach_protocol(), which takes ownership on
 * success.  On failure the if_proto is freed here.  On success,
 * dlil_handle_proto_attach() marks the interface up.
 *
 * Returns 0, EINVAL (bad arguments), ENXIO (interface not in the
 * global list), EEXIST (protocol family already attached), or the
 * error from the family's add_proto callback.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held shared across the attach to keep ifp in the list */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; reclaim the if_proto allocated above */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8412 
/*
 * Attach a v2 KPI protocol handler to an interface.
 *
 * Identical to ifnet_attach_protocol() except that the if_proto is
 * populated with the v2 callback set (kProtoKPI_v2).  Ownership of the
 * if_proto transfers to dlil_attach_protocol() on success; on failure
 * it is freed here.
 *
 * Returns 0, EINVAL (bad arguments), ENXIO (interface not in the
 * global list), EEXIST (protocol family already attached), or the
 * error from the family's add_proto callback.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held shared across the attach to keep ifp in the list */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; reclaim the if_proto allocated above */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8474 
/*
 * Detach a protocol family from an interface.
 *
 * Removes the if_proto from the interface's protocol hash, notifies the
 * family module via if_del_proto, and replaces the protocol's callbacks
 * with inert ifproto_media_* stubs so that any in-flight callers get
 * ENXIO instead of invoking a stale function pointer.  The two
 * if_proto_free() calls drop the attach-time reference and the lookup
 * reference respectively; the remaining detach work happens when the
 * last protocol reference is released.
 *
 * Returns 0 on success, EINVAL for bad arguments, or ENXIO if the
 * protocol family is not attached to the interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* neutralize the callbacks while the ifnet lock is still held */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8540 
8541 
8542 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8543 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8544     struct mbuf *packet, char *header)
8545 {
8546 #pragma unused(ifp, protocol, packet, header)
8547 	return ENXIO;
8548 }
8549 
8550 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8551 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8552     struct mbuf *packet)
8553 {
8554 #pragma unused(ifp, protocol, packet)
8555 	return ENXIO;
8556 }
8557 
8558 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8559 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8560     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8561     char *link_layer_dest)
8562 {
8563 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8564 	return ENXIO;
8565 }
8566 
8567 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8568 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8569     const struct kev_msg *event)
8570 {
8571 #pragma unused(ifp, protocol, event)
8572 }
8573 
8574 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8575 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8576     unsigned long command, void *argument)
8577 {
8578 #pragma unused(ifp, protocol, command, argument)
8579 	return ENXIO;
8580 }
8581 
8582 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8583 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8584     struct sockaddr_dl *out_ll, size_t ll_len)
8585 {
8586 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8587 	return ENXIO;
8588 }
8589 
8590 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8591 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8592     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8593     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8594 {
8595 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8596 	return ENXIO;
8597 }
8598 
8599 extern int if_next_index(void);
8600 extern int tcp_ecn_outbound;
8601 
/*
 * Initialize the transmit classq (ifcq) for an interface.
 *
 * Derives the scheduler flags from global settings (flow advisory,
 * delay-based queueing) and the interface's output scheduling model,
 * inherits the packet drop limit from the default send queue for
 * auxiliary queues, and panics if the queue cannot be set up.
 */
void
dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
{
	uint32_t sflags = 0;
	int err;

	if (if_flowadv) {
		sflags |= PKTSCHEDF_QALG_FLOWCTL;
	}

	if (if_delaybased_queue) {
		sflags |= PKTSCHEDF_QALG_DELAYBASED;
	}

	if (ifp->if_output_sched_model ==
	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
	}
	/* Inherit drop limit from the default queue */
	if (ifp->if_snd != ifcq) {
		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
	}
	/* Initialize transmit queue(s) */
	err = ifclassq_setup(ifcq, ifp, sflags);
	if (err != 0) {
		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
		    "err=%d", __func__, ifp, err);
		/* NOTREACHED */
	}
}
8632 
/*
 * Attach an ifnet to the system so it can begin operating.
 *
 * ll_addr, if non-NULL, supplies the interface's link-layer address;
 * its sdl_alen must match if_addrlen once the latter is non-zero.
 *
 * The attach is serialized via dlil_if_lock.  It assigns an if_index,
 * installs the link address, initializes the transmit classq and the
 * per-interface input/start/poll kernel threads, attaches IGMP/MLD
 * state, waits for the created threads to run once, and finally marks
 * the ifnet IFRF_ATTACHED|IFRF_READY (clearing the embryonic flag).
 *
 * Returns 0 on success, EINVAL for a NULL ifp or mismatched ll_addr,
 * EEXIST if the interface is already attached, ENODEV if the interface
 * lacks the if_add_proto/if_del_proto family callbacks, or ENOBUFS if
 * no if_index slot or link-address storage could be obtained.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	/* the ifnet must still be embryonic (allocated, not yet attached) */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* adopt or validate the link-layer address length */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = zalloc_flags(dlif_phash_zone,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* a reused ifnet keeps its multicast memberships (DLIF_REUSE) */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV: no dedicated thread; not fatal */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* slightly boost the starter thread's priority */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* slightly boost the poller thread's priority */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* count suspended link-layer memberships carried over via DLIF_REUSE */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9134 
9135 /*
 * Prepare the storage for the first/permanent link address, which
 * must have the same lifetime as the ifnet itself.  Although the link
9138  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9139  * its location in memory must never change as it may still be referred
9140  * to by some parts of the system afterwards (unfortunate implementation
9141  * artifacts inherited from BSD.)
9142  *
9143  * Caller must hold ifnet lock as writer.
9144  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute sockaddr_dl sizes: the interface name occupies the
	 * head of sdl_data (masklen covers header + name), followed by
	 * the link-layer address itself; round the total up to a 32-bit
	 * boundary, with sizeof(struct sockaddr_dl) as the minimum.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* Populate the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* Netmask is all-ones over just the name portion */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* Drop the reference held on the previous link address, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9253 
/*
 * Ask the IPv4 and IPv6 layers to remove all of their addresses from
 * this interface (called during final detach).
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9262 
/*
 * Begin detaching an interface: mark it down and DETACHING, remove it
 * from the global ifnet lists so it is no longer visible to lookups,
 * and hand it to the detacher thread, which performs the final
 * teardown (ifnet_detach_final) asynchronously to avoid reentrancy.
 *
 * Returns 0 on success, EINVAL if ifp is NULL or not attached, and
 * ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Stop IPv6 CGA address generation on this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9455 
9456 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9457 ifnet_detaching_enqueue(struct ifnet *ifp)
9458 {
9459 	dlil_if_lock_assert();
9460 
9461 	++ifnet_detaching_cnt;
9462 	VERIFY(ifnet_detaching_cnt != 0);
9463 	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9464 	wakeup((caddr_t)&ifnet_delayed_run);
9465 }
9466 
9467 static struct ifnet *
ifnet_detaching_dequeue(void)9468 ifnet_detaching_dequeue(void)
9469 {
9470 	struct ifnet *ifp;
9471 
9472 	dlil_if_lock_assert();
9473 
9474 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9475 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9476 	if (ifp != NULL) {
9477 		VERIFY(ifnet_detaching_cnt != 0);
9478 		--ifnet_detaching_cnt;
9479 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9480 		ifp->if_detaching_link.tqe_next = NULL;
9481 		ifp->if_detaching_link.tqe_prev = NULL;
9482 	}
9483 	return ifp;
9484 }
9485 
/*
 * Continuation routine for the detacher thread: drains the queue of
 * detaching interfaces, finalizing each via ifnet_detach_final(), then
 * blocks (as a continuation) until more work is enqueued.  Never
 * returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop dlil_if_lock across the blocking final detach */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* Queue drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9528 
/*
 * Entry point for the detacher kernel thread.  The self-wakeup lets
 * the continuation run once to leave the embryonic state (and
 * decrement the pending-thread count) before parking.  Never returns;
 * all subsequent work happens in ifnet_detacher_thread_cont().
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9545 
/*
 * Final stage of interface detach, run from the detacher thread after
 * ifnet_detach() has queued the interface.  Waits for all outstanding
 * I/O references to drain, then tears down (in order): interface
 * filters, attached protocols, the permanent link address, the
 * starter/poller/input threads, and driver callbacks; finally clears
 * IFRF_DETACHING, invokes the driver's if_free callback, and releases
 * the reference held since attach.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach.  This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/* Detach interface filters */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		/* drop if_flt_lock; detaching a filter may block */
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/* Unplumb all protocols */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			/* proto_unplumb() may block; drop the ifnet lock */
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	zfree(dlif_phash_zone, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	IFA_REMREF(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time.  Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;    /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop the ifnet lock; the input thread may need it */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous count needs to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_flags &= ~IFF_PROMISC;

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* tell the driver; it may free its private state */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9926 
/*
 * Placeholder if_output installed on detached interfaces: silently
 * drops the entire packet chain.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
9934 
/*
 * Placeholder if_start installed on detached interfaces: just purges
 * anything remaining in the send queue.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9940 
/*
 * Placeholder input handler installed on detached interfaces: frees
 * the packet chain and reports ENXIO (device not configured).
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9950 
9951 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9952 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9953     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9954 {
9955 #pragma unused(ifp, flags, max_cnt)
9956 	if (m_head != NULL) {
9957 		*m_head = NULL;
9958 	}
9959 	if (m_tail != NULL) {
9960 		*m_tail = NULL;
9961 	}
9962 	if (cnt != NULL) {
9963 		*cnt = 0;
9964 	}
9965 	if (len != NULL) {
9966 		*len = 0;
9967 	}
9968 }
9969 
/*
 * Placeholder output/input control handler for detached interfaces.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9976 
/*
 * Placeholder demux for detached interfaces: frees the packet and
 * tells the caller it has already been consumed.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9984 
/*
 * Placeholder add_proto for detached interfaces: no protocol may be
 * attached anymore.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9992 
/*
 * Placeholder del_proto for detached interfaces: nothing to remove.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9999 
/*
 * Placeholder multicast-address check for detached interfaces.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10006 
/*
 * Legacy framer shim for detached interfaces; forwards to the extended
 * variant.  On macOS the legacy signature lacks the pre/post header
 * space parameters, so NULL is passed for both.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10025 
10026 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10027 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10028     const struct sockaddr *sa, const char *ll, const char *t,
10029     u_int32_t *pre, u_int32_t *post)
10030 {
10031 #pragma unused(ifp, sa, ll, t)
10032 	m_freem(*m);
10033 	*m = NULL;
10034 
10035 	if (pre != NULL) {
10036 		*pre = 0;
10037 	}
10038 	if (post != NULL) {
10039 		*post = 0;
10040 	}
10041 
10042 	return EJUSTRETURN;
10043 }
10044 
/*
 * Placeholder ioctl handler for detached interfaces.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10051 
/*
 * Placeholder BPF tap setter for detached interfaces; accepted as a
 * no-op.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10059 
/*
 * Placeholder if_free callback for detached interfaces (no-op).
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10065 
/*
 * Placeholder event handler for detached interfaces (no-op).
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10071 
/*
 * dlil_if_acquire: find a reusable dlil_ifnet of the given family (matched
 * by uniqueid) or allocate a fresh one, returning it marked DLIF_INUSE with
 * a reference held via dlil_if_ref().
 *
 * Parameters:
 *   family       - interface family to match (ifp->if_family)
 *   uniqueid     - opaque identity blob; a free entry whose stored unique id
 *                  matches is recycled (DLIF_REUSE)
 *   uniqueid_len - length of uniqueid; 0 disables unique-id matching
 *   ifxname      - extended name; collision with an in-use entry is an error
 *   ifp          - out: acquired interface; *ifp must be NULL on entry
 *
 * Returns: 0 on success, EBUSY if an in-use interface has the same extended
 * name or unique id, ENOMEM if the unique-id copy cannot be allocated.
 *
 * Locking: takes dlil_if_lock for the whole operation; each entry's
 * dl_if_lock is taken while its flags/unique id are examined.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* keep a private copy of the caller's unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/* free the raw (unaligned) buffer, not the base */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname/desc point into storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* newly allocated objects must honor the 64-bit alignment contract */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10249 
/*
 * _dlil_if_release: return an ifnet to the "free" pool kept on
 * dlil_ifnet_head.  Drops allocation accounting, frees any oversized
 * broadcast-address buffer, resets the name/xname to the embedded storage
 * (xname becomes "<name>?"), and optionally clears DLIF_INUSE when
 * clear_in_use is true.
 *
 * Locking: takes the ifnet lock exclusively, then the dl_if_lock.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address was heap-allocated only if it didn't fit inline */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10280 
/*
 * dlil_if_release: public wrapper that releases an ifnet without clearing
 * its DLIF_INUSE flag (the entry stays reserved for reuse by unique id).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10286 
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10292 
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10298 
/* Assert that the current thread owns the dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10304 
/*
 * dlil_proto_unplumb_all: detach the protocols that were explicitly plumbed
 * onto this interface (PF_INET and PF_INET6); errors are ignored since a
 * protocol may simply never have been attached.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10320 
/*
 * ifp_src_route_copyout: copy the interface's cached IPv4 forwarding route
 * into *dst under if_cached_route_lock.  The lock is taken in spin mode and
 * converted to a full mutex because route_copyout may take a reference.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10331 
/*
 * ifp_src_route_copyin: install *src as the interface's cached IPv4
 * forwarding route.  If route caching is disabled (if_fwd_cacheok clear)
 * the route reference is simply released instead.  Consumes the caller's
 * reference on src in either case.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10345 
/*
 * ifp_src_route6_copyout: IPv6 counterpart of ifp_src_route_copyout; copies
 * the cached route_in6 into *dst under if_cached_route_lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10357 
/*
 * ifp_src_route6_copyin: IPv6 counterpart of ifp_src_route_copyin; installs
 * *src as the cached route_in6, or releases it when route caching is off.
 * Consumes the caller's reference on src.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10372 
/*
 * ifnet_cached_rtlookup_inet: look up a scoped IPv4 route to src_ip using
 * the interface's one-entry route cache.  On cache miss (unusable route or
 * different destination) a fresh scoped lookup is performed and the result
 * is stored back into the cache.
 *
 * Returns the rtentry (with a reference held for the caller) or NULL if the
 * lookup failed.  Caller is responsible for releasing the reference.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if needed */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10407 
/*
 * ifnet_cached_rtlookup_inet6: IPv6 counterpart of
 * ifnet_cached_rtlookup_inet; looks up a scoped route to *src_ip6 via the
 * interface's cached route_in6, refreshing the cache on a miss.
 *
 * Returns the rtentry (reference held for the caller) or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL here after ROUTE_RELEASE; do a fresh lookup */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10444 
/*
 * if_lqm_update: update the interface's link quality metric state and post
 * a KEV_DL_LINK_QUALITY_METRIC_CHANGED event when it changes.  The raw lqm
 * value is first normalized to one of the threshold edges; a value at or
 * below the ABORT threshold additionally schedules the inpcb fast timer so
 * affected TCP connections are aborted promptly.
 *
 * Parameters:
 *   ifp    - interface to update
 *   lqm    - raw metric, must be within [IFNET_LQM_MIN, IFNET_LQM_MAX]
 *   locked - nonzero if the caller already holds the ifnet lock exclusively
 *
 * Locking: the ifnet lock is always dropped before posting the kernel
 * event, and reacquired on exit only when the caller held it (locked != 0),
 * so the caller's lock state is preserved either way.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10509 
/*
 * if_rrc_state_update: record a new cellular RRC state and post a
 * KEV_DL_RRC_STATE_CHANGED event if the state actually changed.
 *
 * Locking: must be called with the ifnet lock held exclusively; the lock
 * is dropped while the kernel event is posted and reacquired before
 * returning, so the caller's lock state is preserved.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10539 
/*
 * if_state_update: apply a caller-supplied if_interface_state to the
 * interface.  Only the fields flagged in valid_bitmask are applied: link
 * quality (via if_lqm_update), RRC state (cellular only, via
 * if_rrc_state_update) and interface availability.  When the interface
 * transitions to available, outstanding TCP connections on it are probed
 * so they recover without waiting for their timers.
 *
 * Returns: 0 on success; ENOTSUP if an RRC state is supplied for a
 * non-cellular interface; EINVAL for out-of-range LQM or RRC values.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense on cellular interfaces */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		/* pass locked=1: we hold the ifnet lock here */
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10610 
10611 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10612 if_get_state(struct ifnet *ifp,
10613     struct if_interface_state *if_interface_state)
10614 {
10615 	ifnet_lock_shared(ifp);
10616 
10617 	if_interface_state->valid_bitmask = 0;
10618 
10619 	if (ifp->if_interface_state.valid_bitmask &
10620 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10621 		if_interface_state->valid_bitmask |=
10622 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10623 		if_interface_state->rrc_state =
10624 		    ifp->if_interface_state.rrc_state;
10625 	}
10626 	if (ifp->if_interface_state.valid_bitmask &
10627 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10628 		if_interface_state->valid_bitmask |=
10629 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10630 		if_interface_state->lqm_state =
10631 		    ifp->if_interface_state.lqm_state;
10632 	}
10633 	if (ifp->if_interface_state.valid_bitmask &
10634 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10635 		if_interface_state->valid_bitmask |=
10636 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10637 		if_interface_state->interface_availability =
10638 		    ifp->if_interface_state.interface_availability;
10639 	}
10640 
10641 	ifnet_lock_done(ifp);
10642 }
10643 
10644 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10645 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10646 {
10647 	if (conn_probe > 1) {
10648 		return EINVAL;
10649 	}
10650 	if (conn_probe == 0) {
10651 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10652 	} else {
10653 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10654 	}
10655 
10656 #if NECP
10657 	necp_update_all_clients();
10658 #endif /* NECP */
10659 
10660 	tcp_probe_connectivity(ifp, conn_probe);
10661 	return 0;
10662 }
10663 
10664 /* for uuid.c */
/*
 * get_ether_index: scan the global interface list for the best ethernet
 * interface to derive a UUID node from.  Preference order: en0, then the
 * lowest-unit en*, then any IFT_ETHER interface.
 *
 * Returns en0's if_index (0 if en0 not found); *ret_other_index receives
 * the fallback interface index (0 if none) only when en0 was not found.
 *
 * Locking: caller is expected to hold the ifnet head lock (the list is
 * walked here); each entry's lock is taken shared while it is examined.
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
10706 
/*
 * uuid_get_ethernet: fill node[] (ETHER_ADDR_LEN bytes) with a stable
 * ethernet address for UUID generation.  en0's index is cached in a static
 * and revalidated against ifindex2ifnet on every call; the permanent
 * (factory) MAC is preferred over the current lladdr when available since
 * it never changes.
 *
 * Returns 0 on success, -1 if no suitable ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* refresh the cached index if unset or the interface went away */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10748 
10749 static int
10750 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10751 {
10752 #pragma unused(arg1, arg2)
10753 	uint32_t i;
10754 	int err;
10755 
10756 	i = if_rxpoll;
10757 
10758 	err = sysctl_handle_int(oidp, &i, 0, req);
10759 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10760 		return err;
10761 	}
10762 
10763 	if (net_rxpoll == 0) {
10764 		return ENXIO;
10765 	}
10766 
10767 	if_rxpoll = i;
10768 	return err;
10769 }
10770 
10771 static int
10772 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10773 {
10774 #pragma unused(arg1, arg2)
10775 	uint64_t q;
10776 	int err;
10777 
10778 	q = if_rxpoll_mode_holdtime;
10779 
10780 	err = sysctl_handle_quad(oidp, &q, 0, req);
10781 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10782 		return err;
10783 	}
10784 
10785 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10786 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10787 	}
10788 
10789 	if_rxpoll_mode_holdtime = q;
10790 
10791 	return err;
10792 }
10793 
10794 static int
10795 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10796 {
10797 #pragma unused(arg1, arg2)
10798 	uint64_t q;
10799 	int err;
10800 
10801 	q = if_rxpoll_sample_holdtime;
10802 
10803 	err = sysctl_handle_quad(oidp, &q, 0, req);
10804 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10805 		return err;
10806 	}
10807 
10808 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10809 		q = IF_RXPOLL_SAMPLETIME_MIN;
10810 	}
10811 
10812 	if_rxpoll_sample_holdtime = q;
10813 
10814 	return err;
10815 }
10816 
10817 static int
10818 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10819 {
10820 #pragma unused(arg1, arg2)
10821 	uint64_t q;
10822 	int err;
10823 
10824 	q = if_rxpoll_interval_time;
10825 
10826 	err = sysctl_handle_quad(oidp, &q, 0, req);
10827 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10828 		return err;
10829 	}
10830 
10831 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10832 		q = IF_RXPOLL_INTERVALTIME_MIN;
10833 	}
10834 
10835 	if_rxpoll_interval_time = q;
10836 
10837 	return err;
10838 }
10839 
10840 static int
10841 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10842 {
10843 #pragma unused(arg1, arg2)
10844 	uint32_t i;
10845 	int err;
10846 
10847 	i = if_sysctl_rxpoll_wlowat;
10848 
10849 	err = sysctl_handle_int(oidp, &i, 0, req);
10850 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10851 		return err;
10852 	}
10853 
10854 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10855 		return EINVAL;
10856 	}
10857 
10858 	if_sysctl_rxpoll_wlowat = i;
10859 	return err;
10860 }
10861 
10862 static int
10863 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10864 {
10865 #pragma unused(arg1, arg2)
10866 	uint32_t i;
10867 	int err;
10868 
10869 	i = if_sysctl_rxpoll_whiwat;
10870 
10871 	err = sysctl_handle_int(oidp, &i, 0, req);
10872 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10873 		return err;
10874 	}
10875 
10876 	if (i <= if_sysctl_rxpoll_wlowat) {
10877 		return EINVAL;
10878 	}
10879 
10880 	if_sysctl_rxpoll_whiwat = i;
10881 	return err;
10882 }
10883 
10884 static int
10885 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10886 {
10887 #pragma unused(arg1, arg2)
10888 	int i, err;
10889 
10890 	i = if_sndq_maxlen;
10891 
10892 	err = sysctl_handle_int(oidp, &i, 0, req);
10893 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10894 		return err;
10895 	}
10896 
10897 	if (i < IF_SNDQ_MINLEN) {
10898 		i = IF_SNDQ_MINLEN;
10899 	}
10900 
10901 	if_sndq_maxlen = i;
10902 	return err;
10903 }
10904 
10905 static int
10906 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10907 {
10908 #pragma unused(arg1, arg2)
10909 	int i, err;
10910 
10911 	i = if_rcvq_maxlen;
10912 
10913 	err = sysctl_handle_int(oidp, &i, 0, req);
10914 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10915 		return err;
10916 	}
10917 
10918 	if (i < IF_RCVQ_MINLEN) {
10919 		i = IF_RCVQ_MINLEN;
10920 	}
10921 
10922 	if_rcvq_maxlen = i;
10923 	return err;
10924 }
10925 
10926 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10927 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10928     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10929 {
10930 	struct kev_dl_node_presence kev;
10931 	struct sockaddr_dl *sdl;
10932 	struct sockaddr_in6 *sin6;
10933 	int ret = 0;
10934 
10935 	VERIFY(ifp);
10936 	VERIFY(sa);
10937 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10938 
10939 	bzero(&kev, sizeof(kev));
10940 	sin6 = &kev.sin6_node_address;
10941 	sdl = &kev.sdl_node_address;
10942 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10943 	kev.rssi = rssi;
10944 	kev.link_quality_metric = lqm;
10945 	kev.node_proximity_metric = npm;
10946 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10947 
10948 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10949 	if (ret == 0 || ret == EEXIST) {
10950 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10951 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10952 		if (err != 0) {
10953 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10954 			    "error %d\n", __func__, err);
10955 		}
10956 	}
10957 
10958 	if (ret == EEXIST) {
10959 		ret = 0;
10960 	}
10961 	return ret;
10962 }
10963 
/*
 * dlil_node_absent: report that a peer node (by AF_INET6 or AF_LINK
 * address) has disappeared from the interface.  Removes it from the ND6
 * alternate node table and, on success, posts a KEV_DL_NODE_ABSENCE event
 * carrying both the IPv6 and link-layer forms of the address.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		/* kev_sdl is filled in from the neighbor cache entry */
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11004 
/*
 * dlil_node_present_v2: variant of dlil_node_present that takes the IPv6
 * address and link-layer address as two separate sockaddrs instead of
 * decomposing a single one.  Records the node via nd6_alt_node_present and
 * posts a KEV_DL_NODE_PRESENCE event (as an update when the node already
 * existed).
 *
 * Returns: 0 on success (EEXIST is mapped to 0), or the error from
 * nd6_alt_node_present.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* stamp the event copy with this interface's type and index */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		/* EEXIST means the node was already known; post as an update */
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11048 
/*
 * dlil_ifaddr_bytes: return a pointer to the link-layer address bytes of
 * sdl, and optionally its length via *sizep.
 *
 * Under MACF, when link-address checking is required (dlil_lladdr_ckreq)
 * and the caller's credential fails the "net.link.addr" system check, a
 * fixed "unspecified" address is substituted for IFT_ETHER/IFT_IEEE1394
 * interfaces (credp is ignored for other types) so the real hardware
 * address is not disclosed.
 */
const void *
dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
    kauth_cred_t *credp)
{
	const u_int8_t *bytes;
	size_t size;

	bytes = CONST_LLADDR(sdl);
	size = sdl->sdl_alen;

#if CONFIG_MACF
	if (dlil_lladdr_ckreq) {
		switch (sdl->sdl_type) {
		case IFT_ETHER:
		case IFT_IEEE1394:
			break;
		default:
			/* non-ethernet types are exempt from the check */
			credp = NULL;
			break;
		}
		;

		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
			/* placeholder address with the locally-administered bit set */
			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
				[0] = 2
			};

			bytes = unspec;
		}
	}
#else
#pragma unused(credp)
#endif

	if (sizep != NULL) {
		*sizep = size;
	}
	return bytes;
}
11088 
/*
 * dlil_report_issues: post a KEV_DL_ISSUES kernel event on behalf of a
 * driver module, carrying the module identifier, optional argument blob
 * and a timestamp.
 *
 * Parameters:
 *   modid - DLIL_MODIDLEN-byte module identifier (required)
 *   info  - optional DLIL_MODARGLEN-byte argument blob; may be NULL
 */
void
dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
    u_int8_t info[DLIL_MODARGLEN])
{
	struct kev_dl_issues kev;
	struct timeval tv;

	VERIFY(ifp != NULL);
	VERIFY(modid != NULL);
	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);

	bzero(&kev, sizeof(kev));

	microtime(&tv);
	kev.timestamp = tv.tv_sec;
	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
	if (info != NULL) {
		bcopy(info, &kev.info, DLIL_MODARGLEN);
	}

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
	    &kev.link_data, sizeof(kev), FALSE);
}
11113 
/*
 * ifnet_getset_opportunistic: handle the SIOCSIFOPPORTUNISTIC (set,
 * superuser only) and SIOCGIFOPPORTUNISTIC (get) ioctls, which toggle or
 * query opportunistic throttling on the interface.  On both paths the
 * ifr_opportunistic.ifo_inuse field is filled with the count of current
 * opportunistic TCP/UDP connections over the interface.
 *
 * Returns: 0 on success (EALREADY from ifnet_set_throttle is mapped to 0),
 * EINVAL for unrecognized flag combinations, or the error from proc_suser /
 * ifnet_get_throttle / ifnet_set_throttle.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11172 
/*
 * Query the current transmit throttling level of an interface.
 * Requires an interface with a starter thread (IFEF_TXSTART); when
 * the send classq is not enabled the level reads as OFF.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	/* Default answer if the classq can't be queried below */
	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* First field 0 = query (cf. ifnet_set_throttle passing 1) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11198 
/*
 * Set the transmit throttling level of an interface.  Only
 * IFNET_THROTTLE_OFF and IFNET_THROTTLE_OPPORTUNISTIC are accepted.
 * On success, NECP clients are notified of the change, and turning
 * throttling off kicks the starter thread to resume transmission.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	/* Validate the requested level before touching the classq */
	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* First field 1 = set request (cf. query in get_throttle) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* Resuming from throttled state: restart output */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11240 
11241 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11242 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11243     struct proc *p)
11244 {
11245 #pragma unused(p)
11246 	errno_t result = 0;
11247 	uint32_t flags;
11248 	int level, category, subcategory;
11249 
11250 	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11251 
11252 	if (cmd == SIOCSIFLOG) {
11253 		if ((result = priv_check_cred(kauth_cred_get(),
11254 		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11255 			return result;
11256 		}
11257 
11258 		level = ifr->ifr_log.ifl_level;
11259 		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11260 			result = EINVAL;
11261 		}
11262 
11263 		flags = ifr->ifr_log.ifl_flags;
11264 		if ((flags &= IFNET_LOGF_MASK) == 0) {
11265 			result = EINVAL;
11266 		}
11267 
11268 		category = ifr->ifr_log.ifl_category;
11269 		subcategory = ifr->ifr_log.ifl_subcategory;
11270 
11271 		if (result == 0) {
11272 			result = ifnet_set_log(ifp, level, flags,
11273 			    category, subcategory);
11274 		}
11275 	} else {
11276 		result = ifnet_get_log(ifp, &level, &flags, &category,
11277 		    &subcategory);
11278 		if (result == 0) {
11279 			ifr->ifr_log.ifl_level = level;
11280 			ifr->ifr_log.ifl_flags = flags;
11281 			ifr->ifr_log.ifl_category = category;
11282 			ifr->ifr_log.ifl_subcategory = subcategory;
11283 		}
11284 	}
11285 
11286 	return result;
11287 }
11288 
/*
 * Apply a logging level and facility flags to an interface.  The DLIL
 * facility (IFNET_LOGF_DLIL) is handled here; all other facilities
 * are forwarded to the driver through its output control callback
 * when one is registered, and silently dropped otherwise.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is ours; don't forward it down */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		/* Nothing left for DLIL either: reset the level */
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT clears all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11347 
11348 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11349 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11350     int32_t *category, int32_t *subcategory)
11351 {
11352 	if (level != NULL) {
11353 		*level = ifp->if_log.level;
11354 	}
11355 	if (flags != NULL) {
11356 		*flags = ifp->if_log.flags;
11357 	}
11358 	if (category != NULL) {
11359 		*category = ifp->if_log.category;
11360 	}
11361 	if (subcategory != NULL) {
11362 		*subcategory = ifp->if_log.subcategory;
11363 	}
11364 
11365 	return 0;
11366 }
11367 
11368 int
ifnet_notify_address(struct ifnet * ifp,int af)11369 ifnet_notify_address(struct ifnet *ifp, int af)
11370 {
11371 	struct ifnet_notify_address_params na;
11372 
11373 #if PF
11374 	(void) pf_ifaddr_hook(ifp);
11375 #endif /* PF */
11376 
11377 	if (ifp->if_output_ctl == NULL) {
11378 		return EOPNOTSUPP;
11379 	}
11380 
11381 	bzero(&na, sizeof(na));
11382 	na.address_family = (sa_family_t)af;
11383 
11384 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11385 	           sizeof(na), &na);
11386 }
11387 
11388 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11389 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11390 {
11391 	if (ifp == NULL || flowid == NULL) {
11392 		return EINVAL;
11393 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11394 	    !IF_FULLY_ATTACHED(ifp)) {
11395 		return ENXIO;
11396 	}
11397 
11398 	*flowid = ifp->if_flowhash;
11399 
11400 	return 0;
11401 }
11402 
11403 errno_t
ifnet_disable_output(struct ifnet * ifp)11404 ifnet_disable_output(struct ifnet *ifp)
11405 {
11406 	int err;
11407 
11408 	if (ifp == NULL) {
11409 		return EINVAL;
11410 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11411 	    !IF_FULLY_ATTACHED(ifp)) {
11412 		return ENXIO;
11413 	}
11414 
11415 	if ((err = ifnet_fc_add(ifp)) == 0) {
11416 		lck_mtx_lock_spin(&ifp->if_start_lock);
11417 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11418 		lck_mtx_unlock(&ifp->if_start_lock);
11419 	}
11420 	return err;
11421 }
11422 
11423 errno_t
ifnet_enable_output(struct ifnet * ifp)11424 ifnet_enable_output(struct ifnet *ifp)
11425 {
11426 	if (ifp == NULL) {
11427 		return EINVAL;
11428 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11429 	    !IF_FULLY_ATTACHED(ifp)) {
11430 		return ENXIO;
11431 	}
11432 
11433 	ifnet_start_common(ifp, TRUE);
11434 	return 0;
11435 }
11436 
/*
 * Flow advisory: a flow identified by flowhash may resume output.
 * Looks up (and thereby consumes) the flow-control entry added by
 * ifnet_disable_output()/ifnet_fc_add(), re-enables output on the
 * matching interface, and frees the entry.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* ifnet_fc_get() removes the entry from the tree on success */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		/* Only re-enable if the hash still matches this attach */
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* Drop the io refcnt taken by ifnet_is_attached(ifp, 1) */
		ifnet_decr_iorefcnt(ifp);
	}
	/* We own the entry now; release it */
	ifnet_fc_entry_free(ifce);
}
11460 
11461 /*
11462  * Function to compare ifnet_fc_entries in ifnet flow control tree
11463  */
11464 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11465 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11466 {
11467 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11468 }
11469 
/*
 * Insert a flow-control entry for the interface, keyed by its flow
 * hash, into the global ifnet_fc_tree.  Returns 0 if the entry was
 * added (or already present for this ifp), EAGAIN on a flow-hash
 * collision with a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Search key: only the flow hash field matters */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex (zalloc below may block) */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11513 
/*
 * Look up and REMOVE the flow-control entry for the given flow hash.
 * On success the caller owns the returned entry and must free it with
 * ifnet_fc_entry_free().  Returns NULL if no entry exists or if the
 * owning interface is not attached (the entry is freed internally).
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Search key: only the flow hash field matters */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Entry is consumed: detach it from the tree before returning */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11551 
/*
 * Return a flow-control entry to its zone allocator.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11557 
11558 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11559 ifnet_calc_flowhash(struct ifnet *ifp)
11560 {
11561 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11562 	uint32_t flowhash = 0;
11563 
11564 	if (ifnet_flowhash_seed == 0) {
11565 		ifnet_flowhash_seed = RandomULong();
11566 	}
11567 
11568 	bzero(&fh, sizeof(fh));
11569 
11570 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11571 	fh.ifk_unit = ifp->if_unit;
11572 	fh.ifk_flags = ifp->if_flags;
11573 	fh.ifk_eflags = ifp->if_eflags;
11574 	fh.ifk_capabilities = ifp->if_capabilities;
11575 	fh.ifk_capenable = ifp->if_capenable;
11576 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11577 	fh.ifk_rand1 = RandomULong();
11578 	fh.ifk_rand2 = RandomULong();
11579 
11580 try_again:
11581 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11582 	if (flowhash == 0) {
11583 		/* try to get a non-zero flowhash */
11584 		ifnet_flowhash_seed = RandomULong();
11585 		goto try_again;
11586 	}
11587 
11588 	return flowhash;
11589 }
11590 
/*
 * Store (or clear, when len == 0) the per-family network signature on
 * an interface.  Only AF_INET and AF_INET6 are supported; the data is
 * kept in the family-specific extension area, under the respective
 * exclusive data lock.  Returns EINVAL for bad family/length, ENOMEM
 * when the extension area is absent.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* note: break below skips the second unlock */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11652 
/*
 * Copy out the per-family network signature of an interface.  On
 * input *len is the caller's buffer size; on success it is updated to
 * the actual signature length.  Returns EINVAL for a bad family or a
 * too-small buffer, ENOENT when no signature is set, ENOMEM when the
 * extension area is absent.  *flags is always reported as 0.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				/* note: break below skips the second unlock */
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11713 
/*
 * Install (or clear, per-slot, when prefix_len == 0) the interface's
 * NAT64 prefixes.  Each of the NAT64_MAX_NUM_PREFIXES slots is
 * validated: the prefix length must be one of the RFC 6052 lengths
 * listed below and the prefix must not have embedded link/interface
 * scope.  On any validation failure, slots processed so far remain
 * modified.  NECP clients are notified when at least one prefix was
 * set successfully.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Notify NECP outside the data lock */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11779 
/*
 * Copy out the interface's NAT64 prefix table.  Returns ENOENT when
 * no slot holds a prefix, ENOMEM when the inet6 extension area is
 * absent.  The prefixes argument may be NULL to merely probe whether
 * any prefix is configured.
 */
int
ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, found_one = 0, error = 0;

	if (ifp == NULL) {
		return EINVAL;
	}

	if_inet6data_lock_shared(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	/* A slot with non-zero length means a prefix is configured */
	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
			found_one = 1;
		}
	}

	if (found_one == 0) {
		error = ENOENT;
		goto out;
	}

	if (prefixes) {
		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
	}

out:
	if_inet6data_lock_done(ifp);

	return error;
}
11817 
/*
 * Output-path checksum debug hook: when HWCKSUM_DBG_FINALIZE_FORCED
 * is enabled, force software finalization of delayed IPv4/IPv6
 * checksums (TSO packets excluded) and count how often header and
 * data checksums had to be finalized here.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* Nothing to do unless forced finalization is on; skip TSO */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11859 
/*
 * Input-path partial (16-bit one's complement) checksum debug hook.
 * Depending on hwcksum_dbg_mode it can:
 *
 *  - HWCKSUM_DBG_PARTIAL_FORCED: synthesize a partial checksum at a
 *    forced offset, simulating hardware without partial-csum support;
 *  - verify a driver-supplied partial checksum by recomputing it in
 *    software, and optionally re-base it at a different start offset
 *    (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) to emulate various hardwares.
 *
 * Only PF_INET/PF_INET6 packets are examined.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer against mbuf bounds */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer frame header length preceding the payload */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11984 
11985 static int
11986 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11987 {
11988 #pragma unused(arg1, arg2)
11989 	u_int32_t i;
11990 	int err;
11991 
11992 	i = hwcksum_dbg_mode;
11993 
11994 	err = sysctl_handle_int(oidp, &i, 0, req);
11995 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11996 		return err;
11997 	}
11998 
11999 	if (hwcksum_dbg == 0) {
12000 		return ENODEV;
12001 	}
12002 
12003 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12004 		return EINVAL;
12005 	}
12006 
12007 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12008 
12009 	return err;
12010 }
12011 
12012 static int
12013 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12014 {
12015 #pragma unused(arg1, arg2)
12016 	u_int32_t i;
12017 	int err;
12018 
12019 	i = hwcksum_dbg_partial_rxoff_forced;
12020 
12021 	err = sysctl_handle_int(oidp, &i, 0, req);
12022 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12023 		return err;
12024 	}
12025 
12026 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12027 		return ENODEV;
12028 	}
12029 
12030 	hwcksum_dbg_partial_rxoff_forced = i;
12031 
12032 	return err;
12033 }
12034 
12035 static int
12036 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12037 {
12038 #pragma unused(arg1, arg2)
12039 	u_int32_t i;
12040 	int err;
12041 
12042 	i = hwcksum_dbg_partial_rxoff_adj;
12043 
12044 	err = sysctl_handle_int(oidp, &i, 0, req);
12045 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12046 		return err;
12047 	}
12048 
12049 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12050 		return ENODEV;
12051 	}
12052 
12053 	hwcksum_dbg_partial_rxoff_adj = i;
12054 
12055 	return err;
12056 }
12057 
12058 static int
12059 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12060 {
12061 #pragma unused(oidp, arg1, arg2)
12062 	int err;
12063 
12064 	if (req->oldptr == USER_ADDR_NULL) {
12065 	}
12066 	if (req->newptr != USER_ADDR_NULL) {
12067 		return EPERM;
12068 	}
12069 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12070 	    sizeof(struct chain_len_stats));
12071 
12072 	return err;
12073 }
12074 
12075 
12076 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification.  The content itself is irrelevant
 * (it appears to be gzip-compressed data, going by the 1f 8b magic);
 * only the byte values matter to the checksum self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12113 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* sumr has been computed for this entry */
	uint16_t        len;    /* span length (bytes from start of sumdata) */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12138 
12139 static void
dlil_verify_sum16(void)12140 dlil_verify_sum16(void)
12141 {
12142 	struct mbuf *m;
12143 	uint8_t *buf;
12144 	int n;
12145 
12146 	/* Make sure test data plus extra room for alignment fits in cluster */
12147 	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12148 
12149 	kprintf("DLIL: running SUM16 self-tests ... ");
12150 
12151 	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12152 	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12153 
12154 	buf = mtod(m, uint8_t *);               /* base address */
12155 
12156 	for (n = 0; n < SUMTBL_MAX; n++) {
12157 		uint16_t len = sumtbl[n].len;
12158 		int i;
12159 
12160 		/* Verify for all possible alignments */
12161 		for (i = 0; i < (int)sizeof(uint64_t); i++) {
12162 			uint16_t sum, sumr;
12163 			uint8_t *c;
12164 
12165 			/* Copy over test data to mbuf */
12166 			VERIFY(len <= sizeof(sumdata));
12167 			c = buf + i;
12168 			bcopy(sumdata, c, len);
12169 
12170 			/* Zero-offset test (align by data pointer) */
12171 			m->m_data = (caddr_t)c;
12172 			m->m_len = len;
12173 			sum = m_sum16(m, 0, len);
12174 
12175 			if (!sumtbl[n].init) {
12176 				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12177 				sumtbl[n].sumr = sumr;
12178 				sumtbl[n].init = TRUE;
12179 			} else {
12180 				sumr = sumtbl[n].sumr;
12181 			}
12182 
12183 			/* Something is horribly broken; stop now */
12184 			if (sumr != sumtbl[n].sumrp) {
12185 				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12186 				    "for len=%d align=%d sum=0x%04x "
12187 				    "[expected=0x%04x]\n", __func__,
12188 				    len, i, sum, sumr);
12189 				/* NOTREACHED */
12190 			} else if (sum != sumr) {
12191 				panic_plain("\n%s: broken m_sum16() for len=%d "
12192 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12193 				    __func__, len, i, sum, sumr);
12194 				/* NOTREACHED */
12195 			}
12196 
12197 			/* Alignment test by offset (fixed data pointer) */
12198 			m->m_data = (caddr_t)buf;
12199 			m->m_len = i + len;
12200 			sum = m_sum16(m, i, len);
12201 
12202 			/* Something is horribly broken; stop now */
12203 			if (sum != sumr) {
12204 				panic_plain("\n%s: broken m_sum16() for len=%d "
12205 				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
12206 				    __func__, len, i, sum, sumr);
12207 				/* NOTREACHED */
12208 			}
12209 #if INET
12210 			/* Simple sum16 contiguous buffer test by aligment */
12211 			sum = b_sum16(c, len);
12212 
12213 			/* Something is horribly broken; stop now */
12214 			if (sum != sumr) {
12215 				panic_plain("\n%s: broken b_sum16() for len=%d "
12216 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12217 				    __func__, len, i, sum, sumr);
12218 				/* NOTREACHED */
12219 			}
12220 #endif /* INET */
12221 		}
12222 	}
12223 	m_freem(m);
12224 
12225 	kprintf("PASSED\n");
12226 }
12227 #endif /* DEBUG || DEVELOPMENT */
12228 
12229 #define CASE_STRINGIFY(x) case x: return #x
12230 
12231 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)12232 dlil_kev_dl_code_str(u_int32_t event_code)
12233 {
12234 	switch (event_code) {
12235 		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
12236 		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
12237 		CASE_STRINGIFY(KEV_DL_SIFMTU);
12238 		CASE_STRINGIFY(KEV_DL_SIFPHYS);
12239 		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
12240 		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
12241 		CASE_STRINGIFY(KEV_DL_ADDMULTI);
12242 		CASE_STRINGIFY(KEV_DL_DELMULTI);
12243 		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
12244 		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
12245 		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
12246 		CASE_STRINGIFY(KEV_DL_LINK_OFF);
12247 		CASE_STRINGIFY(KEV_DL_LINK_ON);
12248 		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
12249 		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
12250 		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
12251 		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
12252 		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
12253 		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
12254 		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
12255 		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
12256 		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
12257 		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
12258 		CASE_STRINGIFY(KEV_DL_ISSUES);
12259 		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
12260 	default:
12261 		break;
12262 	}
12263 	return "";
12264 }
12265 
12266 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12267 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12268 {
12269 #pragma unused(arg1)
12270 	struct ifnet *ifp = arg0;
12271 
12272 	if (ifnet_is_attached(ifp, 1)) {
12273 		nstat_ifnet_threshold_reached(ifp->if_index);
12274 		ifnet_decr_iorefcnt(ifp);
12275 	}
12276 }
12277 
12278 void
ifnet_notify_data_threshold(struct ifnet * ifp)12279 ifnet_notify_data_threshold(struct ifnet *ifp)
12280 {
12281 	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
12282 	uint64_t oldbytes = ifp->if_dt_bytes;
12283 
12284 	ASSERT(ifp->if_dt_tcall != NULL);
12285 
12286 	/*
12287 	 * If we went over the threshold, notify NetworkStatistics.
12288 	 * We rate-limit it based on the threshold interval value.
12289 	 */
12290 	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
12291 	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
12292 	    !thread_call_isactive(ifp->if_dt_tcall)) {
12293 		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
12294 		uint64_t now = mach_absolute_time(), deadline = now;
12295 		uint64_t ival;
12296 
12297 		if (tival != 0) {
12298 			nanoseconds_to_absolutetime(tival, &ival);
12299 			clock_deadline_for_periodic_event(ival, now, &deadline);
12300 			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
12301 			    deadline);
12302 		} else {
12303 			(void) thread_call_enter(ifp->if_dt_tcall);
12304 		}
12305 	}
12306 }
12307 
12308 #if (DEVELOPMENT || DEBUG)
12309 /*
12310  * The sysctl variable name contains the input parameters of
12311  * ifnet_get_keepalive_offload_frames()
12312  *  ifp (interface index): name[0]
12313  *  frames_array_count:    name[1]
12314  *  frame_data_offset:     name[2]
12315  * The return length gives used_frames_count
12316  */
/*
 * Sysctl handler: copy out the keepalive offload frames configured on
 * one interface.  Inputs arrive via the OID name vector (see the block
 * comment above): name[0] = ifindex, name[1] = frames_array_count,
 * name[2] = frame_data_offset.  The reply is an array of
 * struct ifnet_keepalive_offload_frame; its length encodes
 * used_frames_count.  Root-only, read-only.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	/* Read-only sysctl: reject writes outright. */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	/* Expect exactly the three name[] components listed above. */
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	/*
	 * NOTE(review): name[] is int, so a negative name[2] becomes a huge
	 * size_t here -- presumably rejected inside
	 * ifnet_get_keepalive_offload_frames(); confirm.
	 */
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* Translate ifindex to ifnet under the head lock. */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	/*
	 * NOTE(review): no I/O reference is taken on ifp before the head
	 * lock is dropped; assumes the interface cannot detach while this
	 * DEVELOPMENT/DEBUG sysctl runs -- verify.
	 */
	ifnet_head_done();

	/* Size already bounded by the req->oldlen check above. */
	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out only the frames that were actually filled in. */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12408 #endif /* DEVELOPMENT || DEBUG */
12409 
/*
 * Forward per-flow interface statistics to the TCP layer, which owns
 * the accumulation logic (tcp_update_stats_per_flow()).
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12416 
/*
 * Atomically OR set_flags into *flags_p; returns the flags value prior
 * to the update (OSBitOrAtomic semantics).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12422 
/* Atomically clear clear_flags in *flags_p; previous value is discarded. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12428 
/* Atomically set bits in if_eflags; returns the pre-update flags. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12434 
/* Atomically clear bits in if_eflags. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12440 
/* Atomically set bits in if_xflags; returns the pre-update flags. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12446 
/* Atomically clear bits in if_xflags. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12452 
/* Bump the traffic-rule generation id so cached readers resynchronize. */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}
12458 
12459 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12460 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12461 {
12462 	if (*genid != ifp->if_traffic_rule_genid) {
12463 		*genid = ifp->if_traffic_rule_genid;
12464 		return TRUE;
12465 	}
12466 	return FALSE;
12467 }
/*
 * Publish a new traffic-rule count for the interface and bump the
 * generation id so ifnet_sync_traffic_rule_genid() callers notice.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12474 
12475 static void
log_hexdump(void * data,size_t len)12476 log_hexdump(void *data, size_t len)
12477 {
12478 	size_t i, j, k;
12479 	unsigned char *ptr = (unsigned char *)data;
12480 #define MAX_DUMP_BUF 32
12481 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12482 
12483 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12484 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12485 			unsigned char msnbl = ptr[j] >> 4;
12486 			unsigned char lsnbl = ptr[j] & 0x0f;
12487 
12488 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12489 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12490 
12491 			if ((j % 2) == 1) {
12492 				buf[k++] = ' ';
12493 			}
12494 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12495 				buf[k++] = ' ';
12496 			}
12497 		}
12498 		buf[k] = 0;
12499 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12500 	}
12501 }
12502 
12503 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12504 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12505 net_check_compatible_if_filter(struct ifnet *ifp)
12506 {
12507 	if (ifp == NULL) {
12508 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12509 			return false;
12510 		}
12511 	} else {
12512 		if (ifp->if_flt_non_os_count > 0) {
12513 			return false;
12514 		}
12515 	}
12516 	return true;
12517 }
12518 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12519 
12520 #define DUMP_BUF_CHK() {        \
12521 	clen -= k;              \
12522 	if (clen < 1)           \
12523 	        goto done;      \
12524 	c += k;                 \
12525 }
12526 
12527 int dlil_dump_top_if_qlen(char *, int);
12528 int
dlil_dump_top_if_qlen(char * str,int str_len)12529 dlil_dump_top_if_qlen(char *str, int str_len)
12530 {
12531 	char *c = str;
12532 	int k, clen = str_len;
12533 	struct ifnet *top_ifcq_ifp = NULL;
12534 	uint32_t top_ifcq_len = 0;
12535 	struct ifnet *top_inq_ifp = NULL;
12536 	uint32_t top_inq_len = 0;
12537 
12538 	for (int ifidx = 1; ifidx < if_index; ifidx++) {
12539 		struct ifnet *ifp = ifindex2ifnet[ifidx];
12540 		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12541 
12542 		if (ifp == NULL) {
12543 			continue;
12544 		}
12545 		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12546 			top_ifcq_len = ifp->if_snd->ifcq_len;
12547 			top_ifcq_ifp = ifp;
12548 		}
12549 		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12550 			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12551 			top_inq_ifp = ifp;
12552 		}
12553 	}
12554 
12555 	if (top_ifcq_ifp != NULL) {
12556 		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12557 		    top_ifcq_len, top_ifcq_ifp->if_xname);
12558 		DUMP_BUF_CHK();
12559 	}
12560 	if (top_inq_ifp != NULL) {
12561 		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12562 		    top_inq_len, top_inq_ifp->if_xname);
12563 		DUMP_BUF_CHK();
12564 	}
12565 done:
12566 	return str_len - clen;
12567 }
12568 
12569 #if DEVELOPMENT || DEBUG
12570 __private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12571 packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12572 {
12573 	struct flow_key key = {};
12574 	int error = 0;
12575 
12576 	if (req->newptr == USER_ADDR_NULL) {
12577 		return EINVAL;
12578 	}
12579 	if (req->newlen < sizeof(struct flow_key)) {
12580 		return EINVAL;
12581 	}
12582 	error = SYSCTL_IN(req, &key, sizeof(struct flow_key));
12583 	if (error != 0) {
12584 		return error;
12585 	}
12586 
12587 	switch (key.fk_ipver) {
12588 	case IPVERSION:
12589 		if (key.fk_proto != IPPROTO_UDP ||
12590 		    key.fk_sport == 0 || key.fk_dport == 0) {
12591 			return EINVAL;
12592 		}
12593 
12594 		if (key.fk_src4.s_addr == INADDR_ANY ||
12595 		    key.fk_dst4.s_addr == INADDR_ANY) {
12596 			return EINVAL;
12597 		}
12598 
12599 		break;
12600 	case IPV6_VERSION:
12601 		if (key.fk_proto != IPPROTO_UDP ||
12602 		    key.fk_sport == 0 || key.fk_dport == 0) {
12603 			return EINVAL;
12604 		}
12605 
12606 		if (IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12607 		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12608 			return EINVAL;
12609 		}
12610 
12611 		break;
12612 	case 0:
12613 		if (key.fk_proto != 0 ||
12614 		    key.fk_sport != 0 || key.fk_dport != 0) {
12615 			return EINVAL;
12616 		}
12617 
12618 		if (!IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12619 		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12620 			return EINVAL;
12621 		}
12622 
12623 		break;
12624 	default:
12625 		return EINVAL;
12626 	}
12627 
12628 	memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
12629 	return 0;
12630 }
12631 #endif /* DEVELOPMENT || DEBUG */
12632