xref: /xnu-8796.141.3/bsd/net/dlil.c (revision 1b191cb58250d0705d8a51287127505aa4bc0789)
1 /*
2  * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include <stddef.h>
35 #include <ptrauth.h>
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62 
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69 
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
146 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151 
152 #define IFNET_KTRACE_TX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x001)
153 #define IFNET_KTRACE_RX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x002)
154 
155 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
156 #define MAX_LINKADDR        4 /* LONGWORDS */
157 
158 
159 #if 1
160 #define DLIL_PRINTF     printf
161 #else
162 #define DLIL_PRINTF     kprintf
163 #endif
164 
165 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
166 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
167 
168 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
169 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170 
/*
 * Values stored in if_proto.proto_kpi, selecting which arm of the
 * if_proto kpi union (v1 or v2) holds the protocol's callbacks.
 */
enum {
	kProtoKPI_v1    = 1,
	kProtoKPI_v2    = 2
};
175 
176 /*
177  * List of if_proto structures in if_proto_hash[] is protected by
178  * the ifnet lock.  The rest of the fields are initialized at protocol
179  * attach time and never change, thus no lock required as long as
180  * a reference to it is valid, via if_proto_ref().
181  */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;      /* if_proto_hash[] chain linkage (ifnet lock) */
	u_int32_t                   refcount;       /* managed via if_proto_ref()/if_proto_free() */
	u_int32_t                   detached;       /* nonzero once detached -- see if_proto_free(); confirm */
	struct ifnet                *ifp;           /* interface this protocol is attached to */
	protocol_family_t           protocol_family; /* protocol family identifier */
	int                         proto_kpi;      /* kProtoKPI_v1 or kProtoKPI_v2: selects union arm */
	union {
		/* valid when proto_kpi == kProtoKPI_v1 */
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		/* valid when proto_kpi == kProtoKPI_v2 (input takes no frame header) */
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
210 
211 SLIST_HEAD(proto_hash_entry, if_proto);
212 
213 #define DLIL_SDLDATALEN \
214 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215 
/*
 * DLIL-private wrapper around the public ifnet.  The embedded dl_if
 * must remain the FIRST member: IFP_TO_DLIL()/DLIL_TO_IFP() convert
 * between the two by pointer cast/member address.
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct {
		struct ifaddr   ifa;            /* lladdr ifa */
		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;
	u_int8_t dl_if_unused;
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
243 
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG      0x4     /* has debugging info */
248 
249 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
250 
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253 
/*
 * Debug variant of dlil_ifnet; allocated when ifnet_debug is enabled.
 * dldbg_dlif must remain the first member so a dlil_ifnet pointer can
 * double as a dlil_ifnet_dbg pointer in debug builds -- confirm.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 * NOTE(review): presumably the counts above index these arrays
	 * modulo IF_REF_TRACE_HIST_SIZE -- verify against the trace code.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
264 
/*
 * Convert between the DLIL-private dlil_ifnet and its embedded public
 * ifnet.  IFP_TO_DLIL() relies on dl_if being the first member of
 * struct dlil_ifnet, so the two pointers are interchangeable.
 * Macro arguments are fully parenthesized (CERT PRE01-C) so that a
 * cast or arithmetic expression passed as 's' binds as intended.
 */
#define DLIL_TO_IFP(s)  (&(s)->dl_if)
#define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)(s))
267 
/*
 * State for one attached interface filter (see kpi_interfacefilter.h
 * for the iff_* callback contracts).
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* per-ifnet filter list linkage */
	u_int32_t                       filt_skip;      /* when set, filter is bypassed -- confirm */
	u_int32_t                       filt_flags;     /* filter flags */
	ifnet_t                         filt_ifp;       /* interface the filter is attached to */
	const char                      *filt_name;     /* client-supplied filter name */
	void                            *filt_cookie;   /* opaque client state passed to callbacks */
	protocol_family_t               filt_protocol;  /* protocol family filtered; presumably 0 = all */
	iff_input_func                  filt_input;     /* inbound packet hook */
	iff_output_func                 filt_output;    /* outbound packet hook */
	iff_event_func                  filt_event;     /* interface event hook */
	iff_ioctl_func                  filt_ioctl;     /* ioctl hook */
	iff_detached_func               filt_detached;  /* called on filter detach */
};
282 
283 struct proto_input_entry;
284 
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286 
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288 
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294 
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297     &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299     &dlil_lck_attributes);
300 
301 #if DEBUG
302 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug;        /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
308 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
310 
311 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
312 
313 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
314 
315 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
316 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
317 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
318 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
319 
320 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
321 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
322 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
323 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
324 
325 static u_int32_t net_rtref;
326 
327 static struct dlil_main_threading_info dlil_main_input_thread_info;
328 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
329     (struct dlil_threading_info *)&dlil_main_input_thread_info;
330 
331 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
332 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
333 static void dlil_if_trace(struct dlil_ifnet *, int);
334 static void if_proto_ref(struct if_proto *);
335 static void if_proto_free(struct if_proto *);
336 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
337 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
338     u_int32_t list_count);
339 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
340 static void if_flt_monitor_busy(struct ifnet *);
341 static void if_flt_monitor_unbusy(struct ifnet *);
342 static void if_flt_monitor_enter(struct ifnet *);
343 static void if_flt_monitor_leave(struct ifnet *);
344 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
345     char **, protocol_family_t);
346 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
347     protocol_family_t);
348 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
349     const struct sockaddr_dl *);
350 static int ifnet_lookup(struct ifnet *);
351 static void if_purgeaddrs(struct ifnet *);
352 
353 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
354     struct mbuf *, char *);
355 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
356     struct mbuf *);
357 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
358     mbuf_t *, const struct sockaddr *, void *, char *, char *);
359 static void ifproto_media_event(struct ifnet *, protocol_family_t,
360     const struct kev_msg *);
361 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
362     unsigned long, void *);
363 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
364     struct sockaddr_dl *, size_t);
365 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
366     const struct sockaddr_dl *, const struct sockaddr *,
367     const struct sockaddr_dl *, const struct sockaddr *);
368 
369 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
370     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
371     boolean_t poll, struct thread *tp);
372 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
373     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
374 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
375 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
376     protocol_family_t *);
377 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
378     const struct ifnet_demux_desc *, u_int32_t);
379 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
380 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
381 #if !XNU_TARGET_OS_OSX
382 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
383     const struct sockaddr *, const char *, const char *,
384     u_int32_t *, u_int32_t *);
385 #else /* XNU_TARGET_OS_OSX */
386 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
387     const struct sockaddr *, const char *, const char *);
388 #endif /* XNU_TARGET_OS_OSX */
389 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
390     const struct sockaddr *, const char *, const char *,
391     u_int32_t *, u_int32_t *);
392 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
393 static void ifp_if_free(struct ifnet *);
394 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
395 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
396 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
397 
398 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
399     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
400     boolean_t, struct thread *);
401 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
402     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
403     boolean_t, struct thread *);
404 
405 static void dlil_main_input_thread_func(void *, wait_result_t);
406 static void dlil_main_input_thread_cont(void *, wait_result_t);
407 
408 static void dlil_input_thread_func(void *, wait_result_t);
409 static void dlil_input_thread_cont(void *, wait_result_t);
410 
411 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
412 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
413 
414 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
415     thread_continue_t *);
416 static void dlil_terminate_input_thread(struct dlil_threading_info *);
417 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
418     struct dlil_threading_info *, struct ifnet *, boolean_t);
419 static boolean_t dlil_input_stats_sync(struct ifnet *,
420     struct dlil_threading_info *);
421 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
422     u_int32_t, ifnet_model_t, boolean_t);
423 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
424     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
425 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
426 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
427 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
428 #if DEBUG || DEVELOPMENT
429 static void dlil_verify_sum16(void);
430 #endif /* DEBUG || DEVELOPMENT */
431 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
432     protocol_family_t);
433 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
434     protocol_family_t);
435 
436 static void dlil_incr_pending_thread_count(void);
437 static void dlil_decr_pending_thread_count(void);
438 
439 static void ifnet_detacher_thread_func(void *, wait_result_t);
440 static void ifnet_detacher_thread_cont(void *, wait_result_t);
441 static void ifnet_detach_final(struct ifnet *);
442 static void ifnet_detaching_enqueue(struct ifnet *);
443 static struct ifnet *ifnet_detaching_dequeue(void);
444 
445 static void ifnet_start_thread_func(void *, wait_result_t);
446 static void ifnet_start_thread_cont(void *, wait_result_t);
447 
448 static void ifnet_poll_thread_func(void *, wait_result_t);
449 static void ifnet_poll_thread_cont(void *, wait_result_t);
450 
451 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
452     classq_pkt_t *, boolean_t, boolean_t *);
453 
454 static void ifp_src_route_copyout(struct ifnet *, struct route *);
455 static void ifp_src_route_copyin(struct ifnet *, struct route *);
456 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
457 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
458 
459 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
460 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
461 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
462 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
463 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
464 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
465 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
466 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
467 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
468 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
469 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
470 
471 struct chain_len_stats tx_chain_len_stats;
472 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
473 
474 #if TEST_INPUT_THREAD_TERMINATION
475 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
476 #endif /* TEST_INPUT_THREAD_TERMINATION */
477 
478 
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484 
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486     &dlil_lck_attributes);
487 
488 static uint32_t ifnet_flowhash_seed;
489 
/*
 * Key material hashed by ifnet_calc_flowhash() (with ifnet_flowhash_seed)
 * to derive an interface's flow hash.
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name */
	uint32_t        ifk_unit;               /* interface unit number */
	uint32_t        ifk_flags;              /* interface flags */
	uint32_t        ifk_eflags;             /* extended flags */
	uint32_t        ifk_capabilities;       /* supported capabilities */
	uint32_t        ifk_capenable;          /* enabled capabilities */
	uint32_t        ifk_output_sched_model; /* output scheduling model */
	uint32_t        ifk_rand1;              /* random salt -- confirm source */
	uint32_t        ifk_rand2;              /* random salt -- confirm source */
};
501 
502 /* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
	u_int32_t       ifce_flowhash;          /* lookup key; see ifnet_fc_get() */
	struct ifnet    *ifce_ifp;              /* interface this entry refers to */
};
508 
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511     const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515 
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522 
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525 
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527     u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529     u_int32_t flags);
530 
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532 
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540 
541 #if DEBUG
542 int dlil_verbose = 1;
543 #else
544 int dlil_verbose = 0;
545 #endif /* DEBUG */
546 #if IFNET_INPUT_SANITY_CHK
547 /* sanity checking of input packet lists received */
548 static u_int32_t dlil_input_sanity_check = 0;
549 #endif /* IFNET_INPUT_SANITY_CHK */
550 /* rate limit debug messages */
551 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
552 
553 SYSCTL_DECL(_net_link_generic_system);
554 
555 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
556     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
557 
558 #define IF_SNDQ_MINLEN  32
559 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
560 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
561     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
562     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
563 
564 #define IF_RCVQ_MINLEN  32
565 #define IF_RCVQ_MAXLEN  256
566 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
568     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
569     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
570 
571 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
572 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
573 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
574     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
575     "ilog2 of EWMA decay rate of avg inbound packets");
576 
577 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
578 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
579 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
580 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
581     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
582     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
583     "Q", "input poll mode freeze time");
584 
585 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
586 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
587 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
588 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
589     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
590     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
591     "Q", "input poll sampling time");
592 
593 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
595     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
596     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
597     "Q", "input poll interval (time)");
598 
599 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
600 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
601 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
602     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
603     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
604 
605 #define IF_RXPOLL_WLOWAT        10
606 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
607 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
608     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
609     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
610     "I", "input poll wakeup low watermark");
611 
612 #define IF_RXPOLL_WHIWAT        100
613 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
614 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
615     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
616     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
617     "I", "input poll wakeup high watermark");
618 
619 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
620 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
621     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
622     "max packets per poll call");
623 
624 u_int32_t if_rxpoll = 1;
625 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
626     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
627     sysctl_rxpoll, "I", "enable opportunistic input polling");
628 
629 #if TEST_INPUT_THREAD_TERMINATION
630 static u_int32_t if_input_thread_termination_spin = 0;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
632     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
633     &if_input_thread_termination_spin, 0,
634     sysctl_input_thread_termination_spin,
635     "I", "input thread termination spin limit");
636 #endif /* TEST_INPUT_THREAD_TERMINATION */
637 
638 static u_int32_t cur_dlil_input_threads = 0;
639 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
640     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
641     "Current number of DLIL input threads");
642 
643 #if IFNET_INPUT_SANITY_CHK
644 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
645     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
646     "Turn on sanity checking in DLIL input");
647 #endif /* IFNET_INPUT_SANITY_CHK */
648 
649 static u_int32_t if_flowadv = 1;
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
651     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
652     "enable flow-advisory mechanism");
653 
654 static u_int32_t if_delaybased_queue = 1;
655 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
656     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
657     "enable delay based dynamic queue sizing");
658 
659 static uint64_t hwcksum_in_invalidated = 0;
660 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
661     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
662     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
663 
664 uint32_t hwcksum_dbg = 0;
665 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
666     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
667     "enable hardware cksum debugging");
668 
669 u_int32_t ifnet_start_delayed = 0;
670 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
671     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
672     "number of times start was delayed");
673 
674 u_int32_t ifnet_delay_start_disabled = 0;
675 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
676     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
677     "number of times start was delayed");
678 
679 #if DEVELOPMENT || DEBUG
680 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
681 
682 struct flow_key flow_key_trace;
683 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
684     CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
685 #endif /* DEVELOPMENT || DEBUG */
686 
/*
 * Atomically bump the count of delayed-start disables; the total is
 * exported via the net.link.generic.system.start_delay_disabled sysctl.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
692 
/*
 * Hardware checksum offload debugging flags; the set of bits accepted
 * by the hwcksum_dbg_mode sysctl below (validated by
 * sysctl_hwcksum_dbg_mode).
 */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* current hardware-checksum debugging mode (bits from HWCKSUM_DBG_MASK) */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* read-only debug counters exported through sysctl */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

/* tunables with dedicated validation handlers */
static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* global on/off switches for hardware checksum offload (default on) */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* interface advisory/threshold notification controls */
static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* when non-zero, wake-packet events are logged for debugging */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
811 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)812 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
813 {
814 	/*
815 	 * update filter count and route_generation ID to let TCP
816 	 * know it should reevalute doing TSO or not
817 	 */
818 	if (filter_enable) {
819 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
820 	} else {
821 		VERIFY(ifp->if_flt_no_tso_count != 0);
822 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
823 	}
824 	routegenid_update();
825 }
826 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* derived booleans seeded from the compile-time default attach flags */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
845 
846 #if (DEVELOPMENT || DEBUG)
847 static int
848 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
849 {
850 #pragma unused(oidp, arg1, arg2)
851 	unsigned int new_value;
852 	int changed;
853 	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
854 	    &new_value, &changed);
855 	if (error) {
856 		return error;
857 	}
858 	if (changed) {
859 		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
860 		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
861 			return ENOTSUP;
862 		}
863 		if_attach_nx = new_value;
864 	}
865 	return 0;
866 }
867 
/* development-only knob exposing the nexus auto-attach flags */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
873 
874 static int
875 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
876 {
877 #pragma unused(oidp, arg1, arg2)
878 	unsigned int new_value;
879 	int changed;
880 	int error;
881 
882 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
883 	    sizeof(if_enable_fsw_transport_netagent),
884 	    &new_value, &changed);
885 	if (error == 0 && changed != 0) {
886 		if (new_value != 0 && new_value != 1) {
887 			/* only allow 0 or 1 */
888 			error = EINVAL;
889 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
890 			/* netagent can be enabled/disabled */
891 			if_enable_fsw_transport_netagent = new_value;
892 			if (new_value == 0) {
893 				kern_nexus_deregister_netagents();
894 			} else {
895 				kern_nexus_register_netagents();
896 			}
897 		} else {
898 			/* netagent can't be enabled */
899 			error = ENOTSUP;
900 		}
901 	}
902 	return error;
903 }
904 
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
913 
914 boolean_t
ifnet_nx_noauto(ifnet_t ifp)915 ifnet_nx_noauto(ifnet_t ifp)
916 {
917 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
918 }
919 
920 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)921 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
922 {
923 	return ifnet_is_low_latency(ifp);
924 }
925 
926 boolean_t
ifnet_is_low_latency(ifnet_t ifp)927 ifnet_is_low_latency(ifnet_t ifp)
928 {
929 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
930 }
931 
932 boolean_t
ifnet_needs_compat(ifnet_t ifp)933 ifnet_needs_compat(ifnet_t ifp)
934 {
935 	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
936 		return FALSE;
937 	}
938 #if !XNU_TARGET_OS_OSX
939 	/*
940 	 * To conserve memory, we plumb in the compat layer selectively; this
941 	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
942 	 * In particular, we check for Wi-Fi Access Point.
943 	 */
944 	if (IFNET_IS_WIFI(ifp)) {
945 		/* Wi-Fi Access Point */
946 		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
947 		    ifp->if_name[2] == '\0') {
948 			return if_netif_all;
949 		}
950 	}
951 #else /* XNU_TARGET_OS_OSX */
952 #pragma unused(ifp)
953 #endif /* XNU_TARGET_OS_OSX */
954 	return TRUE;
955 }
956 
957 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)958 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
959 {
960 	if (if_is_fsw_transport_netagent_enabled()) {
961 		/* check if netagent has been manually enabled for ipsec/utun */
962 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
963 			return ipsec_interface_needs_netagent(ifp);
964 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
965 			return utun_interface_needs_netagent(ifp);
966 		}
967 
968 		/* check ifnet no auto nexus override */
969 		if (ifnet_nx_noauto(ifp)) {
970 			return FALSE;
971 		}
972 
973 		/* check global if_attach_nx configuration */
974 		switch (ifp->if_family) {
975 		case IFNET_FAMILY_CELLULAR:
976 		case IFNET_FAMILY_ETHERNET:
977 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
978 				return TRUE;
979 			}
980 			break;
981 		default:
982 			break;
983 		}
984 	}
985 	return FALSE;
986 }
987 
988 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)989 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
990 {
991 #pragma unused(ifp)
992 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
993 		return TRUE;
994 	}
995 	return FALSE;
996 }
997 
998 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)999 ifnet_needs_netif_netagent(ifnet_t ifp)
1000 {
1001 #pragma unused(ifp)
1002 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1003 }
1004 
1005 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1006 dlil_detach_nexus_instance(nexus_controller_t controller,
1007     const char *func_str, uuid_t instance, uuid_t device)
1008 {
1009 	errno_t         err;
1010 
1011 	if (instance == NULL || uuid_is_null(instance)) {
1012 		return FALSE;
1013 	}
1014 
1015 	/* followed by the device port */
1016 	if (device != NULL && !uuid_is_null(device)) {
1017 		err = kern_nexus_ifdetach(controller, instance, device);
1018 		if (err != 0) {
1019 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1020 			    func_str, err);
1021 		}
1022 	}
1023 	err = kern_nexus_controller_free_provider_instance(controller,
1024 	    instance);
1025 	if (err != 0) {
1026 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
1027 		    func_str, err);
1028 	}
1029 	return TRUE;
1030 }
1031 
1032 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1033 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1034     uuid_t device)
1035 {
1036 	boolean_t               detached = FALSE;
1037 	nexus_controller_t      controller = kern_nexus_shared_controller();
1038 	int                     err;
1039 
1040 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1041 	    device)) {
1042 		detached = TRUE;
1043 	}
1044 	if (provider != NULL && !uuid_is_null(provider)) {
1045 		detached = TRUE;
1046 		err = kern_nexus_controller_deregister_provider(controller,
1047 		    provider);
1048 		if (err != 0) {
1049 			DLIL_PRINTF("%s deregister_provider %d\n",
1050 			    func_str, err);
1051 		}
1052 	}
1053 	return detached;
1054 }
1055 
/*
 * Register a nexus provider of the given type ("netif" or
 * "flowswitch") for "ifp" and allocate one provider instance of it.
 * On success the provider and instance UUIDs are returned through
 * "provider" and "instance"; on failure anything created here is
 * rolled back and a non-zero errno is returned.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	/* e.g. "com.apple.netif.en0" or "com.apple.flowswitch.en0" */
	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* roll back the provider registration (best effort) */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* NB: the success path also falls through here, with err == 0 */
failed:
	return err;
}
1105 
/*
 * Create a netif nexus (provider + instance) for "ifp" and attach the
 * interface to it, recording the resulting UUIDs in "netif_nx".
 * Returns TRUE on success; on failure any partially-created nexus
 * state is torn down and FALSE is returned.
 *
 * NOTE(review): the attr object is destroyed only on the failure
 * path; verify kern_nexus_attr_create ownership on the success path.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1159 
1160 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1161 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1162 {
1163 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1164 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1165 		goto failed;
1166 	}
1167 	switch (ifp->if_type) {
1168 	case IFT_CELLULAR:
1169 	case IFT_ETHER:
1170 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1171 			/* don't auto-attach */
1172 			goto failed;
1173 		}
1174 		break;
1175 	default:
1176 		/* don't auto-attach */
1177 		goto failed;
1178 	}
1179 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1180 
1181 failed:
1182 	return FALSE;
1183 }
1184 
1185 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1186 dlil_is_native_netif_nexus(ifnet_t ifp)
1187 {
1188 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1189 }
1190 
/* tear down the netif nexus: provider, instance and ifattach handle */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1198 
1199 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1200 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1201 {
1202 	struct ifreq        ifr;
1203 	int                 error;
1204 
1205 	bzero(&ifr, sizeof(ifr));
1206 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1207 	if (error == 0) {
1208 		*ifdm_p = ifr.ifr_devmtu;
1209 	}
1210 	return error;
1211 }
1212 
1213 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1214 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1215     bool *use_multi_buflet, uint32_t *large_buf_size)
1216 {
1217 	struct kern_pbufpool_memory_info rx_pp_info;
1218 	struct kern_pbufpool_memory_info tx_pp_info;
1219 	uint32_t if_max_mtu = 0;
1220 	uint32_t drv_buf_size;
1221 	struct ifdevmtu ifdm;
1222 	int err;
1223 
1224 	/*
1225 	 * To perform intra-stack RX aggregation flowswitch needs to use
1226 	 * multi-buflet packet.
1227 	 */
1228 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1229 
1230 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1231 	/*
1232 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1233 	 * but the driver advertises the MAX MTU as only 9K.
1234 	 */
1235 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1236 		if_max_mtu = IP_MAXPACKET;
1237 		goto skip_mtu_ioctl;
1238 	}
1239 
1240 	/* determine max mtu */
1241 	bzero(&ifdm, sizeof(ifdm));
1242 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1243 	if (__improbable(err != 0)) {
1244 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1245 		    __func__, if_name(ifp));
1246 		/* use default flowswitch buffer size */
1247 		if_max_mtu = NX_FSW_BUFSIZE;
1248 	} else {
1249 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1250 		    ifdm.ifdm_max, ifdm.ifdm_current);
1251 		/* rdar://problem/44589731 */
1252 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1253 	}
1254 
1255 skip_mtu_ioctl:
1256 	if (if_max_mtu == 0) {
1257 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1258 		    __func__, if_name(ifp));
1259 		return EINVAL;
1260 	}
1261 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1262 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1263 		    "max bufsize(%d)\n", __func__,
1264 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1265 		return EINVAL;
1266 	}
1267 
1268 	/*
1269 	 * for skywalk native driver, consult the driver packet pool also.
1270 	 */
1271 	if (dlil_is_native_netif_nexus(ifp)) {
1272 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1273 		    &tx_pp_info);
1274 		if (err != 0) {
1275 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1276 			    __func__, if_name(ifp));
1277 			return ENXIO;
1278 		}
1279 		drv_buf_size = tx_pp_info.kpm_bufsize *
1280 		    tx_pp_info.kpm_max_frags;
1281 		if (if_max_mtu > drv_buf_size) {
1282 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1283 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1284 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1285 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1286 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1287 			return EINVAL;
1288 		}
1289 	} else {
1290 		drv_buf_size = if_max_mtu;
1291 	}
1292 
1293 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1294 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1295 		*use_multi_buflet = true;
1296 		/* default flowswitch buffer size */
1297 		*buf_size = NX_FSW_BUFSIZE;
1298 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1299 	} else {
1300 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1301 	}
1302 
1303 	/*
1304 	 * if HW TSO is enabled on a Skywalk native interface then make
1305 	 * the flowswitch default buffer be able to handle max TSO segment.
1306 	 */
1307 	uint32_t tso_v4_mtu = 0;
1308 	uint32_t tso_v6_mtu = 0;
1309 #ifdef XNU_TARGET_OS_OSX
1310 	if (dlil_is_native_netif_nexus(ifp)) {
1311 		if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
1312 			tso_v4_mtu = ifp->if_tso_v4_mtu;
1313 		}
1314 		if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
1315 			tso_v6_mtu = ifp->if_tso_v6_mtu;
1316 		}
1317 	}
1318 #endif /* XNU_TARGET_OS_OSX */
1319 	if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
1320 		*buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
1321 		ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1322 	}
1323 	if (*buf_size >= *large_buf_size) {
1324 		*large_buf_size = 0;
1325 	}
1326 	return 0;
1327 }
1328 
/*
 * Create a flowswitch nexus for "ifp" (on top of its existing netif)
 * and attach the netif as its device port, recording the resulting
 * UUIDs in "nexus_fsw".  Returns TRUE on success; on failure any
 * partially-created state is torn down, the reason is logged, and
 * FALSE is returned.
 *
 * NOTE(review): attr is destroyed only on the failure path; verify
 * kern_nexus_attr_create ownership on the success path.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	/* honor per-interface opt-outs from auto-attach */
	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* compute the buffer geometry for this interface */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means a policy decision, not an error */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1427 
/*
 * Attach a flowswitch nexus to "ifp" and publish the resulting state
 * in ifp->if_nx_flowswitch under the ifnet lock.  If the interface
 * started detaching while the nexus was being created, the new nexus
 * is torn down again.  Returns TRUE only when the state was
 * successfully published.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		/* re-check attachment state under the ifnet lock */
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1473 
/* tear down the flowswitch nexus: provider, instance and device port */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1481 
1482 __attribute__((noinline))
1483 static void
dlil_netif_detach_notify(ifnet_t ifp)1484 dlil_netif_detach_notify(ifnet_t ifp)
1485 {
1486 	void (*detach_notify)(struct nexus_netif_adapter *);
1487 
1488 	/*
1489 	 * This is only needed for low latency interfaces for now.
1490 	 */
1491 	if (!ifnet_is_low_latency(ifp)) {
1492 		return;
1493 	}
1494 	detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1495 	if (detach_notify != NULL) {
1496 		(*detach_notify)(ifp->if_na);
1497 	} else {
1498 		DLIL_PRINTF("%s: %s has no detach notify calback\n",
1499 		    __func__, if_name(ifp));
1500 	}
1501 }
1502 
/*
 * Quiesce data movement on "ifp" and tear down both its flowswitch
 * and netif nexuses (in that order), clearing the recorded UUIDs.
 * The UUID invariants are asserted: either all UUIDs of a nexus are
 * set, or none are.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain all data movement before tearing down */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1534 
1535 boolean_t
ifnet_add_netagent(ifnet_t ifp)1536 ifnet_add_netagent(ifnet_t ifp)
1537 {
1538 	int     error;
1539 
1540 	error = kern_nexus_interface_add_netagent(ifp);
1541 	os_log(OS_LOG_DEFAULT,
1542 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1543 	    ifp->if_xname, error);
1544 	return error == 0;
1545 }
1546 
1547 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1548 ifnet_remove_netagent(ifnet_t ifp)
1549 {
1550 	int     error;
1551 
1552 	error = kern_nexus_interface_remove_netagent(ifp);
1553 	os_log(OS_LOG_DEFAULT,
1554 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1555 	    ifp->if_xname, error);
1556 	return error == 0;
1557 }
1558 
1559 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1560 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1561 {
1562 	if (!IF_FULLY_ATTACHED(ifp)) {
1563 		return FALSE;
1564 	}
1565 	return dlil_attach_flowswitch_nexus(ifp);
1566 }
1567 
1568 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1569 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1570 {
1571 	if_nexus_flowswitch     nexus_fsw;
1572 
1573 	ifnet_lock_exclusive(ifp);
1574 	nexus_fsw = ifp->if_nx_flowswitch;
1575 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1576 	ifnet_lock_done(ifp);
1577 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1578 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1579 }
1580 
1581 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1582 ifnet_attach_netif_nexus(ifnet_t ifp)
1583 {
1584 	boolean_t       nexus_attached;
1585 	if_nexus_netif  nexus_netif;
1586 
1587 	if (!IF_FULLY_ATTACHED(ifp)) {
1588 		return FALSE;
1589 	}
1590 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1591 	if (nexus_attached) {
1592 		ifnet_lock_exclusive(ifp);
1593 		ifp->if_nx_netif = nexus_netif;
1594 		ifnet_lock_done(ifp);
1595 	}
1596 	return nexus_attached;
1597 }
1598 
1599 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1600 ifnet_detach_netif_nexus(ifnet_t ifp)
1601 {
1602 	if_nexus_netif  nexus_netif;
1603 
1604 	ifnet_lock_exclusive(ifp);
1605 	nexus_netif = ifp->if_nx_netif;
1606 	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1607 	ifnet_lock_done(ifp);
1608 
1609 	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1610 	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1611 }
1612 
1613 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1614 ifnet_attach_native_flowswitch(ifnet_t ifp)
1615 {
1616 	if (!dlil_is_native_netif_nexus(ifp)) {
1617 		/* not a native netif */
1618 		return;
1619 	}
1620 	ifnet_attach_flowswitch_nexus(ifp);
1621 }
1622 
#endif /* SKYWALK */

/*
 * Sanity-check an inbound mbuf: it must carry a packet header and its
 * recorded receive interface must match "ifp" (loopback excepted);
 * panics otherwise.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}

/*
 * Exponentially weighted moving average with weight 1/2^decay:
 * old += (new - old) / 2^decay, computed in integer arithmetic; the
 * average is seeded directly with "new" while it is still zero.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* link-speed units in bits per second */
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)
1645 
1646 struct rxpoll_time_tbl {
1647 	u_int64_t       speed;          /* downlink speed */
1648 	u_int32_t       plowat;         /* packets low watermark */
1649 	u_int32_t       phiwat;         /* packets high watermark */
1650 	u_int32_t       blowat;         /* bytes low watermark */
1651 	u_int32_t       bhiwat;         /* bytes high watermark */
1652 };
1653 
1654 static struct rxpoll_time_tbl rxpoll_tbl[] = {
1655 	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
1656 	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1657 	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1658 	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1659 	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
1660 	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
1661 };
1662 
/* Serializes updates to dlil_pending_thread_cnt below */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* Count of DLIL threads still starting up; a wakeup is issued at zero */
static uint32_t dlil_pending_thread_cnt = 0;
1666 
1667 static void
dlil_incr_pending_thread_count(void)1668 dlil_incr_pending_thread_count(void)
1669 {
1670 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1671 	lck_mtx_lock(&dlil_thread_sync_lock);
1672 	dlil_pending_thread_cnt++;
1673 	lck_mtx_unlock(&dlil_thread_sync_lock);
1674 }
1675 
1676 static void
dlil_decr_pending_thread_count(void)1677 dlil_decr_pending_thread_count(void)
1678 {
1679 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1680 	lck_mtx_lock(&dlil_thread_sync_lock);
1681 	VERIFY(dlil_pending_thread_cnt > 0);
1682 	dlil_pending_thread_cnt--;
1683 	if (dlil_pending_thread_cnt == 0) {
1684 		wakeup(&dlil_pending_thread_cnt);
1685 	}
1686 	lck_mtx_unlock(&dlil_thread_sync_lock);
1687 }
1688 
1689 int
proto_hash_value(u_int32_t protocol_family)1690 proto_hash_value(u_int32_t protocol_family)
1691 {
1692 	/*
1693 	 * dlil_proto_unplumb_all() depends on the mapping between
1694 	 * the hash bucket index and the protocol family defined
1695 	 * here; future changes must be applied there as well.
1696 	 */
1697 	switch (protocol_family) {
1698 	case PF_INET:
1699 		return 0;
1700 	case PF_INET6:
1701 		return 1;
1702 	case PF_VLAN:
1703 		return 2;
1704 	case PF_UNSPEC:
1705 	default:
1706 		return 3;
1707 	}
1708 }
1709 
1710 /*
1711  * Caller must already be holding ifnet lock.
1712  */
1713 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1714 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1715 {
1716 	struct if_proto *proto = NULL;
1717 	u_int32_t i = proto_hash_value(protocol_family);
1718 
1719 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1720 
1721 	if (ifp->if_proto_hash != NULL) {
1722 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1723 	}
1724 
1725 	while (proto != NULL && proto->protocol_family != protocol_family) {
1726 		proto = SLIST_NEXT(proto, next_hash);
1727 	}
1728 
1729 	if (proto != NULL) {
1730 		if_proto_ref(proto);
1731 	}
1732 
1733 	return proto;
1734 }
1735 
/*
 * Take an additional reference on an attached-protocol entry.
 * Dropped via if_proto_free(), which destroys the entry once the
 * count reaches zero.
 */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1741 
1742 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1743 
/*
 * Drop a reference on an attached-protocol entry.  The final release
 * invokes the protocol's detached callback (v1 or v2 KPI), purges the
 * interface/protocol routes, posts KEV_DL_PROTO_DETACHED, and frees
 * the entry.  If no protocols remain attached, the interface is also
 * marked down.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	/* last reference: notify the protocol it has been detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1805 
1806 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1807 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1808 {
1809 #if !MACH_ASSERT
1810 #pragma unused(ifp)
1811 #endif
1812 	unsigned int type = 0;
1813 	int ass = 1;
1814 
1815 	switch (what) {
1816 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1817 		type = LCK_RW_ASSERT_EXCLUSIVE;
1818 		break;
1819 
1820 	case IFNET_LCK_ASSERT_SHARED:
1821 		type = LCK_RW_ASSERT_SHARED;
1822 		break;
1823 
1824 	case IFNET_LCK_ASSERT_OWNED:
1825 		type = LCK_RW_ASSERT_HELD;
1826 		break;
1827 
1828 	case IFNET_LCK_ASSERT_NOTOWNED:
1829 		/* nothing to do here for RW lock; bypass assert */
1830 		ass = 0;
1831 		break;
1832 
1833 	default:
1834 		panic("bad ifnet assert type: %d", what);
1835 		/* NOTREACHED */
1836 	}
1837 	if (ass) {
1838 		LCK_RW_ASSERT(&ifp->if_lock, type);
1839 	}
1840 }
1841 
/* Acquire the ifnet RW lock for reading (shared). */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the ifnet RW lock for writing (exclusive). */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the ifnet RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1859 
#if INET
/* Reader lock for the ifnet's INET (IPv4) protocol-private data. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Writer lock for the ifnet's INET (IPv4) protocol-private data. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the INET data lock (shared or exclusive). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif

/* Reader lock for the ifnet's INET6 (IPv6) protocol-private data. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Writer lock for the ifnet's INET6 (IPv6) protocol-private data. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the INET6 data lock (shared or exclusive). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1897 
/* Reader lock on the global list of attached interfaces. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Writer lock on the global list of attached interfaces. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the interface-list lock (shared or exclusive). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert the interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1921 
1922 /*
1923  * dlil_ifp_protolist
1924  * - get the list of protocols attached to the interface, or just the number
1925  *   of attached protocols
1926  * - if the number returned is greater than 'list_count', truncation occurred
1927  *
1928  * Note:
1929  * - caller must already be holding ifnet lock.
1930  */
1931 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1932 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1933     u_int32_t list_count)
1934 {
1935 	u_int32_t       count = 0;
1936 	int             i;
1937 
1938 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1939 
1940 	if (ifp->if_proto_hash == NULL) {
1941 		goto done;
1942 	}
1943 
1944 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1945 		struct if_proto *proto;
1946 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1947 			if (list != NULL && count < list_count) {
1948 				list[count] = proto->protocol_family;
1949 			}
1950 			count++;
1951 		}
1952 	}
1953 done:
1954 	return count;
1955 }
1956 
/*
 * Snapshot the protocol families attached to @ifp into @protolist
 * while holding the ifnet lock shared.  Returns the total number of
 * attached protocols; if that exceeds @count, truncation occurred.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1965 
/*
 * Release a protocol-family list buffer used with if_get_protolist()
 * (presumably allocated by its callers via kalloc data — the
 * allocation site is not in this file).
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1971 
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for @ifp.  If the
 * caller supplies no event_data, a bare net_event_data is generated;
 * either way the interface name/family/unit fields are filled in here.
 * The interface generation update is skipped for known high-frequency
 * events and when the caller explicitly suppresses it (to avoid
 * triggering expensive NECP client updates).  Returns the result of
 * dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2033 
/*
 * Allocate the per-interface protocol statistics hung off the ifnet:
 * 64-bit-aligned TCP/UDP local stats (carved out of oversized zone
 * buffers, with the original buffer pointer stashed just below the
 * aligned base for later zfree) plus the per-address-family ECN stat
 * structures.  Returns 0 on success; on failure all partially
 * allocated structures are released.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}
	/*
	 * NOTE(review): if exactly one of if_tcp_stat/if_udp_stat were
	 * already set on entry, ret would remain EINVAL and the cleanup
	 * below would free the pre-existing structures.  Callers appear
	 * to invoke this only with both unset — confirm before relying
	 * on re-entry with partial state.
	 */

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* failure path: release everything allocated above */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2119 
/*
 * Return all opportunistic-polling state on @ifp to its defaults:
 * cancel the poll cycle, clear flags/requests, force the input model
 * back to INPUT_POLL_OFF, and zero the accumulated statistics and
 * mode/sample timestamps.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2138 
/*
 * Set up a DLIL input thread for @ifp (or the main input thread when
 * @ifp is NULL): select the input strategy and thread function from
 * the interface's capabilities, initialize the per-thread lock and
 * packet queue, and start the kernel thread.  *thfunc (if non-NULL)
 * receives the chosen thread function — NULL for the synchronous
 * strategy, which needs no dedicated thread and yields ENODEV.
 * Returns 0 on success; panics if thread creation itself fails.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling requires RXPOLL capability on a legacy (non-netif) ifp */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* no polling: leave the receive queue effectively unbounded */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2273 
2274 #if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler (test builds only): read/write the busy-spin
 * iteration count consumed by dlil_terminate_input_thread().
 * Writes are rejected with ENXIO unless net_rxpoll is enabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only request: nothing to update */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2296 #endif /* TEST_INPUT_THREAD_TERMINATION */
2297 
/*
 * Reset a dlil_threading_info to its quiescent state after its input
 * thread has terminated: destroy the per-thread lock and lock group,
 * and zero the queue limit, statistics and identity fields.  VERIFYs
 * that no packets, affinity state or helper threads remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2323 
/*
 * Final teardown, executed on the input thread itself (never the main
 * one): drain and free any queued packets, flag termination complete
 * and wake whoever is blocked on dlth_flags, drop the thread reference
 * taken by kernel_thread_start(), and terminate the calling thread.
 * Never returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach any pending packets while holding the lock */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2371 
2372 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2373 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2374 {
2375 	thread_affinity_policy_data_t policy;
2376 
2377 	bzero(&policy, sizeof(policy));
2378 	policy.affinity_tag = tag;
2379 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2380 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2381 }
2382 
2383 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Net-filter eventhandler callback: enable the flowswitch transport
 * netagent only while no filtering subsystem (other than the private
 * PF proxy) is active.  On a state flip the nexus netagents are
 * re-registered; otherwise, while disabled, NECP clients are told to
 * re-evaluate.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		necp_update_all_clients();
	}
}
2400 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2401 
2402 void
dlil_init(void)2403 dlil_init(void)
2404 {
2405 	thread_t thread = THREAD_NULL;
2406 
2407 	/*
2408 	 * The following fields must be 64-bit aligned for atomic operations.
2409 	 */
2410 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2411 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2412 	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2413 	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2414 	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2415 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2416 	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2417 	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2418 	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2419 	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2420 	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2421 	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2422 	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2423 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2424 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2425 
2426 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2427 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2428 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2429 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2430 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2431 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2432 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2433 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2434 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2435 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2436 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2437 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2438 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2439 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2440 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2441 
2442 	/*
2443 	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2444 	 */
2445 	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2446 	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2447 	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2448 	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2449 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2450 	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2451 	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2452 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2453 	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2454 	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2455 	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2456 	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2457 	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2458 	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2459 
2460 	/*
2461 	 * ... as well as the mbuf checksum flags counterparts.
2462 	 */
2463 	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2464 	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2465 	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2466 	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2467 	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2468 	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2469 	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2470 	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2471 	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2472 	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2473 	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2474 
2475 	/*
2476 	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2477 	 */
2478 	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2479 	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2480 
2481 	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2482 	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2483 	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2484 	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2485 
2486 	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2487 	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2488 	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2489 
2490 	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2491 	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2492 	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2493 	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2494 	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2495 	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2496 	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2497 	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2498 	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2499 	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2500 	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2501 	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2502 	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2503 	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2504 	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2505 	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2506 	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2507 	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2508 
2509 	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2510 	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2511 	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2512 	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2513 	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2514 	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2515 	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2516 	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2517 	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
2518 	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
2519 	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);
2520 
2521 	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2522 	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2523 
2524 	PE_parse_boot_argn("net_affinity", &net_affinity,
2525 	    sizeof(net_affinity));
2526 
2527 	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2528 
2529 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2530 
2531 	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2532 
2533 	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2534 
2535 	VERIFY(dlil_pending_thread_cnt == 0);
2536 #if SKYWALK
2537 	boolean_t pe_enable_fsw_transport_netagent = FALSE;
2538 	boolean_t pe_disable_fsw_transport_netagent = FALSE;
2539 	boolean_t enable_fsw_netagent =
2540 	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2541 	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2542 
2543 	/*
2544 	 * Check the device tree to see if Skywalk netagent has been explicitly
2545 	 * enabled or disabled.  This can be overridden via if_attach_nx below.
2546 	 * Note that the property is a 0-length key, and so checking for the
2547 	 * presence itself is enough (no need to check for the actual value of
2548 	 * the retrieved variable.)
2549 	 */
2550 	pe_enable_fsw_transport_netagent =
2551 	    PE_get_default("kern.skywalk_netagent_enable",
2552 	    &pe_enable_fsw_transport_netagent,
2553 	    sizeof(pe_enable_fsw_transport_netagent));
2554 	pe_disable_fsw_transport_netagent =
2555 	    PE_get_default("kern.skywalk_netagent_disable",
2556 	    &pe_disable_fsw_transport_netagent,
2557 	    sizeof(pe_disable_fsw_transport_netagent));
2558 
2559 	/*
2560 	 * These two are mutually exclusive, i.e. they both can be absent,
2561 	 * but only one can be present at a time, and so we assert to make
2562 	 * sure it is correct.
2563 	 */
2564 	VERIFY((!pe_enable_fsw_transport_netagent &&
2565 	    !pe_disable_fsw_transport_netagent) ||
2566 	    (pe_enable_fsw_transport_netagent ^
2567 	    pe_disable_fsw_transport_netagent));
2568 
2569 	if (pe_enable_fsw_transport_netagent) {
2570 		kprintf("SK: netagent is enabled via an override for "
2571 		    "this platform\n");
2572 		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2573 	} else if (pe_disable_fsw_transport_netagent) {
2574 		kprintf("SK: netagent is disabled via an override for "
2575 		    "this platform\n");
2576 		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2577 	} else {
2578 		kprintf("SK: netagent is %s by default for this platform\n",
2579 		    (enable_fsw_netagent ? "enabled" : "disabled"));
2580 		if_attach_nx = IF_ATTACH_NX_DEFAULT;
2581 	}
2582 
2583 	/*
2584 	 * Now see if there's a boot-arg override.
2585 	 */
2586 	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2587 	    sizeof(if_attach_nx));
2588 	if_enable_fsw_transport_netagent =
2589 	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2590 
2591 	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2592 
2593 	if (pe_disable_fsw_transport_netagent &&
2594 	    if_enable_fsw_transport_netagent) {
2595 		kprintf("SK: netagent is force-enabled\n");
2596 	} else if (!pe_disable_fsw_transport_netagent &&
2597 	    !if_enable_fsw_transport_netagent) {
2598 		kprintf("SK: netagent is force-disabled\n");
2599 	}
2600 #ifdef XNU_TARGET_OS_OSX
2601 	if (if_enable_fsw_transport_netagent) {
2602 		net_filter_event_register(dlil_filter_event);
2603 	}
2604 #endif /* XNU_TARGET_OS_OSX */
2605 
2606 #if (DEVELOPMENT || DEBUG)
2607 	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2608 	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2609 #endif /* (DEVELOPMENT || DEBUG) */
2610 
2611 #endif /* SKYWALK */
2612 	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2613 	    sizeof(struct dlil_ifnet_dbg);
2614 	/* Enforce 64-bit alignment for dlil_ifnet structure */
2615 	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2616 	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2617 	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2618 
2619 	dlif_tcpstat_size = sizeof(struct tcpstat_local);
2620 	/* Enforce 64-bit alignment for tcpstat_local structure */
2621 	dlif_tcpstat_bufsize =
2622 	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2623 	dlif_tcpstat_bufsize = (uint32_t)
2624 	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2625 	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2626 	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2627 
2628 	dlif_udpstat_size = sizeof(struct udpstat_local);
2629 	/* Enforce 64-bit alignment for udpstat_local structure */
2630 	dlif_udpstat_bufsize =
2631 	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2632 	dlif_udpstat_bufsize = (uint32_t)
2633 	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2634 	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2635 	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2636 
2637 	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2638 
2639 	TAILQ_INIT(&dlil_ifnet_head);
2640 	TAILQ_INIT(&ifnet_head);
2641 	TAILQ_INIT(&ifnet_detaching_head);
2642 	TAILQ_INIT(&ifnet_ordered_head);
2643 
2644 	/* Initialize interface address subsystem */
2645 	ifa_init();
2646 
2647 #if PF
2648 	/* Initialize the packet filter */
2649 	pfinit();
2650 #endif /* PF */
2651 
2652 	/* Initialize queue algorithms */
2653 	classq_init();
2654 
2655 	/* Initialize packet schedulers */
2656 	pktsched_init();
2657 
2658 	/* Initialize flow advisory subsystem */
2659 	flowadv_init();
2660 
2661 	/* Initialize the pktap virtual interface */
2662 	pktap_init();
2663 
2664 	/* Initialize the service class to dscp map */
2665 	net_qos_map_init();
2666 
2667 	/* Initialize the interface low power mode event handler */
2668 	if_low_power_evhdlr_init();
2669 
2670 	/* Initialize the interface offload port list subsystem */
2671 	if_ports_used_init();
2672 
2673 #if DEBUG || DEVELOPMENT
2674 	/* Run self-tests */
2675 	dlil_verify_sum16();
2676 #endif /* DEBUG || DEVELOPMENT */
2677 
2678 	/*
2679 	 * Create and start up the main DLIL input thread and the interface
2680 	 * detacher threads once everything is initialized.
2681 	 */
2682 	dlil_incr_pending_thread_count();
2683 	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2684 
2685 	/*
2686 	 * Create ifnet detacher thread.
2687 	 * When an interface gets detached, part of the detach processing
2688 	 * is delayed. The interface is added to delayed detach list
2689 	 * and this thread is woken up to call ifnet_detach_final
2690 	 * on these interfaces.
2691 	 */
2692 	dlil_incr_pending_thread_count();
2693 	if (kernel_thread_start(ifnet_detacher_thread_func,
2694 	    NULL, &thread) != KERN_SUCCESS) {
2695 		panic_plain("%s: couldn't create detacher thread", __func__);
2696 		/* NOTREACHED */
2697 	}
2698 	thread_deallocate(thread);
2699 
2700 	/*
2701 	 * Wait for the created kernel threads for dlil to get
2702 	 * scheduled and run at least once before we proceed
2703 	 */
2704 	lck_mtx_lock(&dlil_thread_sync_lock);
2705 	while (dlil_pending_thread_cnt != 0) {
2706 		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2707 		    "threads to get scheduled at least once.\n", __func__);
2708 		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2709 		    (PZERO - 1), __func__, NULL);
2710 		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2711 	}
2712 	lck_mtx_unlock(&dlil_thread_sync_lock);
2713 	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2714 	    "scheduled at least once. Proceeding.\n", __func__);
2715 }
2716 
/*
 * Mark the interface's filter list busy.  Caller must hold if_flt_lock.
 * Each call must be balanced by if_flt_monitor_leave() (or its alias
 * if_flt_monitor_unbusy()).
 */
static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	VERIFY(ifp->if_flt_busy != 0);  /* catch counter wraparound */
}
2725 
/*
 * Drop a busy reference on the filter list; thin alias of
 * if_flt_monitor_leave() kept for naming symmetry with
 * if_flt_monitor_busy().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2731 
/*
 * Enter the filter-list monitor: sleep (dropping and re-taking
 * if_flt_lock inside msleep) until no other thread holds the list
 * busy, then mark it busy for this thread.  Caller must hold
 * if_flt_lock; it is still held on return.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	/* re-check after each wakeup; another waiter may have won */
	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2744 
/*
 * Leave the filter-list monitor: drop one busy reference and, if this
 * was the last one and there are sleepers in if_flt_monitor_enter(),
 * wake them all.  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		/* waiters loop on if_flt_busy, so a broadcast is safe */
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2758 
/*
 * Attach an interface filter to ifp, copying the callbacks and metadata
 * from if_filter; on success the new filter reference is returned via
 * filter_ref.  Returns 0 on success or ENXIO if the interface is not
 * (or is no longer) attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* Take an IO refcnt (second arg 1) so ifp can't detach under us */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: no allocation-failure path to unwind from here on */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	/* detached callback is installed unconditionally */
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* per-interface count of non-internal (non-OS) filters */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	/* only reachable with filter != NULL if a later stage failed */
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2849 
/*
 * Detach "filter".  When "detached" is zero, the filter is looked up
 * across all attached interfaces, unlinked under the filter monitor,
 * and destroyed; EINVAL is returned if the reference is not found.
 * When "detached" is non-zero, the caller (ifnet_detach_final) has
 * already emptied if_flt_head, so only the counters are adjusted and
 * the detached callback is run before the filter is freed.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate
				 * doing TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate
		 * doing TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* mirror the increments performed in dlil_attach_filter() */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;   /* so the error print below is skipped on success */
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2970 
2971 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2972 dlil_detach_filter(interface_filter_t filter)
2973 {
2974 	if (filter == NULL) {
2975 		return;
2976 	}
2977 	dlil_detach_filter_internal(filter, 0);
2978 }
2979 
2980 __private_extern__ boolean_t
dlil_has_ip_filter(void)2981 dlil_has_ip_filter(void)
2982 {
2983 	boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2984 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2985 	return has_filter;
2986 }
2987 
2988 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2989 dlil_has_if_filter(struct ifnet *ifp)
2990 {
2991 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2992 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2993 	return has_filter;
2994 }
2995 
2996 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2997 dlil_input_wakeup(struct dlil_threading_info *inp)
2998 {
2999 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3000 
3001 	inp->dlth_flags |= DLIL_INPUT_WAITING;
3002 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3003 		inp->dlth_wtot++;
3004 		wakeup_one((caddr_t)&inp->dlth_flags);
3005 	}
3006 }
3007 
/*
 * Bootstrap entry point for the main DLIL input thread.  Performs
 * one-time sanity checks, marks the thread embryonic, self-wakes so
 * that the continuation runs once (letting it clear the embryonic
 * state and decrement the pending-thread count), then blocks into
 * dlil_main_input_thread_cont.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);  /* main thread serves no single ifp */
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* register the wait before dropping the lock so no wakeup is lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3030 
3031 /*
3032  * Main input thread:
3033  *
3034  *   a) handles all inbound packets for lo0
3035  *   b) handles all inbound packets for interfaces with no dedicated
3036  *	input thread (e.g. anything but Ethernet/PDP or those that support
3037  *	opportunistic polling.)
3038  *   c) protocol registrations
3039  *   d) packet injections
3040  */
/*
 * Continuation body of the main input thread: drains pending packets
 * (general queue and the lo0-only queue) and protocol registration
 * requests, looping while new work arrives, then re-arms the wait and
 * blocks back into itself.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after bootstrap: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* process the dequeued chains without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if some flag besides RUNNING was set */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no more work: re-arm the wait (lock held) and block again */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3127 
3128 /*
3129  * Input thread for interfaces with legacy input model.
3130  */
/*
 * Bootstrap entry point for a per-interface (legacy model) input
 * thread.  Validates the threading setup, names the thread after the
 * interface, marks it embryonic, self-wakes so the continuation runs
 * once, then blocks into dlil_input_thread_cont.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy interfaces must not be using the rxpoll model here */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* register the wait before dropping the lock so no wakeup is lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3165 
/*
 * Continuation body of a per-interface (legacy model) input thread:
 * drains the interface's packet queue, syncs input stats, and loops
 * while new work arrives.  Honors DLIL_INPUT_TERMINATE (or an
 * interrupted wait) by handing itself to dlil_terminate_input_thread;
 * otherwise re-arms the wait and blocks back into itself.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after bootstrap: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* process the dequeued chain without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if work arrived while we were busy */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		/* dlth_lock is held on entry to this label */
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait (lock held) and block back into self */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3269 
3270 /*
3271  * Input thread for interfaces with opportunistic polling input model.
3272  */
/*
 * Bootstrap entry point for a per-interface opportunistic-polling
 * (rxpoll) input thread.  Validates that the interface actually uses
 * the rxpoll legacy model, names the thread after the interface,
 * marks it embryonic, self-wakes so the continuation runs once, then
 * blocks into dlil_rxpoll_input_thread_cont.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* register the wait before dropping the lock so no wakeup is lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3304 
3305 __attribute__((noreturn))
3306 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3307 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3308 {
3309 	struct dlil_threading_info *inp = v;
3310 	struct ifnet *ifp = inp->dlth_ifp;
3311 	struct timespec ts;
3312 
3313 	lck_mtx_lock_spin(&inp->dlth_lock);
3314 	if (__improbable(wres == THREAD_INTERRUPTED ||
3315 	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3316 		goto terminate;
3317 	}
3318 
3319 	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3320 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
3321 
3322 	while (1) {
3323 		struct mbuf *m = NULL;
3324 		uint32_t m_cnt, poll_req = 0;
3325 		uint64_t m_size = 0;
3326 		ifnet_model_t mode;
3327 		struct timespec now, delta;
3328 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3329 		boolean_t notify;
3330 		boolean_t embryonic;
3331 		uint64_t ival;
3332 
3333 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3334 
3335 		if (__improbable(embryonic =
3336 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3337 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3338 			goto skip;
3339 		}
3340 
3341 		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3342 			ival = IF_RXPOLL_INTERVALTIME_MIN;
3343 		}
3344 
3345 		/* Link parameters changed? */
3346 		if (ifp->if_poll_update != 0) {
3347 			ifp->if_poll_update = 0;
3348 			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3349 		}
3350 
3351 		/* Current operating mode */
3352 		mode = ifp->if_poll_mode;
3353 
3354 		/*
3355 		 * Protocol registration and injection must always use
3356 		 * the main input thread; in theory the latter can utilize
3357 		 * the corresponding input thread where the packet arrived
3358 		 * on, but that requires our knowing the interface in advance
3359 		 * (and the benefits might not worth the trouble.)
3360 		 */
3361 		VERIFY(!(inp->dlth_flags &
3362 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3363 
3364 		/* Total count of all packets */
3365 		m_cnt = qlen(&inp->dlth_pkts);
3366 
3367 		/* Total bytes of all packets */
3368 		m_size = qsize(&inp->dlth_pkts);
3369 
3370 		/* Packets for this interface */
3371 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3372 		m = pkt.cp_mbuf;
3373 		VERIFY(m != NULL || m_cnt == 0);
3374 
3375 		nanouptime(&now);
3376 		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3377 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3378 		}
3379 
3380 		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3381 		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3382 			u_int32_t ptot, btot;
3383 
3384 			/* Accumulate statistics for current sampling */
3385 			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3386 
3387 			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3388 				goto skip;
3389 			}
3390 
3391 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3392 
3393 			/* Calculate min/max of inbound bytes */
3394 			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3395 			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3396 				ifp->if_rxpoll_bmin = btot;
3397 			}
3398 			if (btot > ifp->if_rxpoll_bmax) {
3399 				ifp->if_rxpoll_bmax = btot;
3400 			}
3401 
3402 			/* Calculate EWMA of inbound bytes */
3403 			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3404 
3405 			/* Calculate min/max of inbound packets */
3406 			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3407 			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3408 				ifp->if_rxpoll_pmin = ptot;
3409 			}
3410 			if (ptot > ifp->if_rxpoll_pmax) {
3411 				ifp->if_rxpoll_pmax = ptot;
3412 			}
3413 
3414 			/* Calculate EWMA of inbound packets */
3415 			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3416 
3417 			/* Reset sampling statistics */
3418 			PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3419 
3420 			/* Calculate EWMA of wakeup requests */
3421 			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3422 			    if_rxpoll_decay);
3423 			inp->dlth_wtot = 0;
3424 
3425 			if (dlil_verbose) {
3426 				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3427 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3428 				}
3429 				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3430 				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3431 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3432 					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3433 					    "limits [%d/%d], wreq avg %d "
3434 					    "limits [%d/%d], bytes avg %d "
3435 					    "limits [%d/%d]\n", if_name(ifp),
3436 					    (ifp->if_poll_mode ==
3437 					    IFNET_MODEL_INPUT_POLL_ON) ?
3438 					    "ON" : "OFF", ifp->if_rxpoll_pavg,
3439 					    ifp->if_rxpoll_pmax,
3440 					    ifp->if_rxpoll_plowat,
3441 					    ifp->if_rxpoll_phiwat,
3442 					    ifp->if_rxpoll_wavg,
3443 					    ifp->if_rxpoll_wlowat,
3444 					    ifp->if_rxpoll_whiwat,
3445 					    ifp->if_rxpoll_bavg,
3446 					    ifp->if_rxpoll_blowat,
3447 					    ifp->if_rxpoll_bhiwat);
3448 				}
3449 			}
3450 
3451 			/* Perform mode transition, if necessary */
3452 			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3453 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3454 			}
3455 
3456 			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3457 			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3458 				goto skip;
3459 			}
3460 
3461 			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3462 			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3463 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3464 				mode = IFNET_MODEL_INPUT_POLL_OFF;
3465 			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3466 			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3467 			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3468 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3469 				mode = IFNET_MODEL_INPUT_POLL_ON;
3470 			}
3471 
3472 			if (mode != ifp->if_poll_mode) {
3473 				ifp->if_poll_mode = mode;
3474 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3475 				poll_req++;
3476 			}
3477 		}
3478 skip:
3479 		notify = dlil_input_stats_sync(ifp, inp);
3480 
3481 		lck_mtx_unlock(&inp->dlth_lock);
3482 
3483 		if (__improbable(embryonic)) {
3484 			ifnet_decr_pending_thread_count(ifp);
3485 		}
3486 
3487 		if (__improbable(notify)) {
3488 			ifnet_notify_data_threshold(ifp);
3489 		}
3490 
3491 		/*
3492 		 * If there's a mode change and interface is still attached,
3493 		 * perform a downcall to the driver for the new mode.  Also
3494 		 * hold an IO refcnt on the interface to prevent it from
3495 		 * being detached (will be release below.)
3496 		 */
3497 		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3498 			struct ifnet_model_params p = {
3499 				.model = mode, .reserved = { 0 }
3500 			};
3501 			errno_t err;
3502 
3503 			if (dlil_verbose) {
3504 				DLIL_PRINTF("%s: polling is now %s, "
3505 				    "pkts avg %d max %d limits [%d/%d], "
3506 				    "wreq avg %d limits [%d/%d], "
3507 				    "bytes avg %d limits [%d/%d]\n",
3508 				    if_name(ifp),
3509 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3510 				    "ON" : "OFF", ifp->if_rxpoll_pavg,
3511 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3512 				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3513 				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3514 				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3515 				    ifp->if_rxpoll_bhiwat);
3516 			}
3517 
3518 			if ((err = ((*ifp->if_input_ctl)(ifp,
3519 			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3520 				DLIL_PRINTF("%s: error setting polling mode "
3521 				    "to %s (%d)\n", if_name(ifp),
3522 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3523 				    "ON" : "OFF", err);
3524 			}
3525 
3526 			switch (mode) {
3527 			case IFNET_MODEL_INPUT_POLL_OFF:
3528 				ifnet_set_poll_cycle(ifp, NULL);
3529 				ifp->if_rxpoll_offreq++;
3530 				if (err != 0) {
3531 					ifp->if_rxpoll_offerr++;
3532 				}
3533 				break;
3534 
3535 			case IFNET_MODEL_INPUT_POLL_ON:
3536 				net_nsectimer(&ival, &ts);
3537 				ifnet_set_poll_cycle(ifp, &ts);
3538 				ifnet_poll(ifp);
3539 				ifp->if_rxpoll_onreq++;
3540 				if (err != 0) {
3541 					ifp->if_rxpoll_onerr++;
3542 				}
3543 				break;
3544 
3545 			default:
3546 				VERIFY(0);
3547 				/* NOTREACHED */
3548 			}
3549 
3550 			/* Release the IO refcnt */
3551 			ifnet_decr_iorefcnt(ifp);
3552 		}
3553 
3554 		/*
3555 		 * NOTE warning %%% attention !!!!
3556 		 * We should think about putting some thread starvation
3557 		 * safeguards if we deal with long chains of packets.
3558 		 */
3559 		if (__probable(m != NULL)) {
3560 			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3561 		}
3562 
3563 		lck_mtx_lock_spin(&inp->dlth_lock);
3564 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3565 		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3566 		    DLIL_INPUT_TERMINATE))) {
3567 			break;
3568 		}
3569 	}
3570 
3571 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3572 
3573 	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3574 terminate:
3575 		lck_mtx_unlock(&inp->dlth_lock);
3576 		dlil_terminate_input_thread(inp);
3577 		/* NOTREACHED */
3578 	} else {
3579 		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3580 		lck_mtx_unlock(&inp->dlth_lock);
3581 		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3582 		    inp);
3583 		/* NOTREACHED */
3584 	}
3585 
3586 	VERIFY(0);      /* we should never get here */
3587 	/* NOTREACHED */
3588 	__builtin_unreachable();
3589 }
3590 
3591 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3592 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3593 {
3594 	if (p != NULL) {
3595 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3596 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3597 			return EINVAL;
3598 		}
3599 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3600 		    p->packets_lowat >= p->packets_hiwat) {
3601 			return EINVAL;
3602 		}
3603 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3604 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3605 			return EINVAL;
3606 		}
3607 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3608 		    p->bytes_lowat >= p->bytes_hiwat) {
3609 			return EINVAL;
3610 		}
3611 		if (p->interval_time != 0 &&
3612 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3613 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3614 		}
3615 	}
3616 	return 0;
3617 }
3618 
3619 void
dlil_rxpoll_update_params(struct ifnet * ifp,struct ifnet_poll_params * p)3620 dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3621 {
3622 	u_int64_t sample_holdtime, inbw;
3623 
3624 	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
3625 		sample_holdtime = 0;    /* polling is disabled */
3626 		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
3627 		    ifp->if_rxpoll_blowat = 0;
3628 		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
3629 		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
3630 		ifp->if_rxpoll_plim = 0;
3631 		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
3632 	} else {
3633 		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
3634 		u_int64_t ival;
3635 		unsigned int n, i;
3636 
3637 		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
3638 			if (inbw < rxpoll_tbl[i].speed) {
3639 				break;
3640 			}
3641 			n = i;
3642 		}
3643 		/* auto-tune if caller didn't specify a value */
3644 		plowat = ((p == NULL || p->packets_lowat == 0) ?
3645 		    rxpoll_tbl[n].plowat : p->packets_lowat);
3646 		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
3647 		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
3648 		blowat = ((p == NULL || p->bytes_lowat == 0) ?
3649 		    rxpoll_tbl[n].blowat : p->bytes_lowat);
3650 		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
3651 		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
3652 		plim = ((p == NULL || p->packets_limit == 0 ||
3653 		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
3654 		ival = ((p == NULL || p->interval_time == 0 ||
3655 		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
3656 		    if_rxpoll_interval_time : p->interval_time);
3657 
3658 		VERIFY(plowat != 0 && phiwat != 0);
3659 		VERIFY(blowat != 0 && bhiwat != 0);
3660 		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
3661 
3662 		sample_holdtime = if_rxpoll_sample_holdtime;
3663 		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
3664 		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
3665 		ifp->if_rxpoll_plowat = plowat;
3666 		ifp->if_rxpoll_phiwat = phiwat;
3667 		ifp->if_rxpoll_blowat = blowat;
3668 		ifp->if_rxpoll_bhiwat = bhiwat;
3669 		ifp->if_rxpoll_plim = plim;
3670 		ifp->if_rxpoll_ival = ival;
3671 	}
3672 
3673 	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
3674 	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);
3675 
3676 	if (dlil_verbose) {
3677 		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
3678 		    "poll interval %llu nsec, pkts per poll %u, "
3679 		    "pkt limits [%u/%u], wreq limits [%u/%u], "
3680 		    "bytes limits [%u/%u]\n", if_name(ifp),
3681 		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
3682 		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
3683 		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
3684 		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
3685 		    ifp->if_rxpoll_bhiwat);
3686 	}
3687 }
3688 
3689 /*
3690  * Must be called on an attached ifnet (caller is expected to check.)
3691  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3692  */
3693 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3694 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3695     boolean_t locked)
3696 {
3697 	errno_t err;
3698 	struct dlil_threading_info *inp;
3699 
3700 	VERIFY(ifp != NULL);
3701 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3702 		return ENXIO;
3703 	}
3704 	err = dlil_rxpoll_validate_params(p);
3705 	if (err != 0) {
3706 		return err;
3707 	}
3708 
3709 	if (!locked) {
3710 		lck_mtx_lock(&inp->dlth_lock);
3711 	}
3712 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3713 	/*
3714 	 * Normally, we'd reset the parameters to the auto-tuned values
3715 	 * if the the input thread detects a change in link rate.  If the
3716 	 * driver provides its own parameters right after a link rate
3717 	 * changes, but before the input thread gets to run, we want to
3718 	 * make sure to keep the driver's values.  Clearing if_poll_update
3719 	 * will achieve that.
3720 	 */
3721 	if (p != NULL && !locked && ifp->if_poll_update != 0) {
3722 		ifp->if_poll_update = 0;
3723 	}
3724 	dlil_rxpoll_update_params(ifp, p);
3725 	if (!locked) {
3726 		lck_mtx_unlock(&inp->dlth_lock);
3727 	}
3728 	return 0;
3729 }
3730 
3731 /*
3732  * Must be called on an attached ifnet (caller is expected to check.)
3733  */
3734 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3735 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3736 {
3737 	struct dlil_threading_info *inp;
3738 
3739 	VERIFY(ifp != NULL && p != NULL);
3740 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3741 		return ENXIO;
3742 	}
3743 
3744 	bzero(p, sizeof(*p));
3745 
3746 	lck_mtx_lock(&inp->dlth_lock);
3747 	p->packets_limit = ifp->if_rxpoll_plim;
3748 	p->packets_lowat = ifp->if_rxpoll_plowat;
3749 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3750 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3751 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3752 	p->interval_time = ifp->if_rxpoll_ival;
3753 	lck_mtx_unlock(&inp->dlth_lock);
3754 
3755 	return 0;
3756 }
3757 
/*
 * Driver-facing entry point to inject a chain of received packets
 * into DLIL.  Thin wrapper around ifnet_input_common() with no tail
 * mbuf, non-extended stats (ext == FALSE) and polling off.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3764 
/*
 * Extended variant of ifnet_input(): the driver supplies the chain's
 * tail mbuf and mandatory stat increments (ext == TRUE), which
 * ifnet_input_common() will use instead of walking the chain (or
 * cross-check against it when input sanity checking is enabled.)
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3771 
/*
 * Variant used by the RX poller (poll == TRUE).  An empty chain
 * (m_head == NULL) is permitted here; the stats are treated as
 * extended only when a chain is actually present.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	           (m_head != NULL), TRUE);
}
3779 
/*
 * Common implementation behind ifnet_input(), ifnet_input_extended()
 * and ifnet_input_poll(): validate the inbound mbuf chain, derive (or
 * verify) the packet/byte counts, then hand the chain to the DLIL
 * input function of the interface.
 *
 * m_head/m_tail: head and (optional) tail of the packet chain.
 * s:             caller-supplied stat increments (required when ext.)
 * ext:           counters in `s' came straight from the device driver.
 * poll:          chain was harvested via RX polling (may be empty.)
 *
 * Returns 0 on success, or EINVAL if the parameters are inconsistent
 * or the interface is detaching; in either error case the chain (if
 * any) is freed here.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* an empty chain is only legal for the poll variant; ext needs stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be release below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail supplied: walk the chain to find it, counting as we go */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* recount the chain to cross-check the driver's stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			/* trust the driver-supplied counters */
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * NOTE(review): the recomputed counters land in _s, yet `s'
	 * (still pointing at the caller's struct when one was given) is
	 * what gets passed below; _s only reaches the input function
	 * when the caller passed s == NULL.  Looks intentional -- verify
	 * against the dlil_input_* consumers before changing.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3894 
3895 #if SKYWALK
3896 errno_t
dlil_set_input_handler(struct ifnet * ifp,dlil_input_func fn)3897 dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
3898 {
3899 	return atomic_test_set_ptr(&ifp->if_input_dlil,
3900 	           ptrauth_nop_cast(void *, &dlil_input_handler),
3901 	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
3902 }
3903 
3904 void
dlil_reset_input_handler(struct ifnet * ifp)3905 dlil_reset_input_handler(struct ifnet *ifp)
3906 {
3907 	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
3908 	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
3909 	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
3910 		;
3911 	}
3912 }
3913 
3914 errno_t
dlil_set_output_handler(struct ifnet * ifp,dlil_output_func fn)3915 dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
3916 {
3917 	return atomic_test_set_ptr(&ifp->if_output_dlil,
3918 	           ptrauth_nop_cast(void *, &dlil_output_handler),
3919 	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
3920 }
3921 
3922 void
dlil_reset_output_handler(struct ifnet * ifp)3923 dlil_reset_output_handler(struct ifnet *ifp)
3924 {
3925 	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
3926 	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
3927 	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
3928 		;
3929 	}
3930 }
3931 #endif /* SKYWALK */
3932 
/*
 * Default DLIL output handler: simply pass the packet on to the
 * driver's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
3938 
/*
 * Default entry point for inbound packets into DLIL: pick the
 * interface's input thread (falling back to the main input thread
 * when the interface has none) and dispatch the chain via that
 * thread's configured strategy (async or sync.)
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		/* no dedicated input thread; use the shared main one */
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		/* debug hook: force synchronous processing on this thread */
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3959 
/*
 * Asynchronous input strategy: enqueue the chain on the input
 * thread's packet queue, add the stat increments, and wake the input
 * thread; the packets are processed later in that thread's context.
 * Also (at most once per input thread) folds the calling driver or
 * poller thread into the input thread's affinity set when one is
 * configured.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the lock across the affinity downcall */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* debug mode: recount the chain and panic on any mismatch */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* threshold notification is done outside the lock */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4072 
/*
 * Synchronous input strategy: enqueue the chain on the input thread's
 * queue (for accounting and sanity checking), then immediately drain
 * the whole queue and process the packets in the caller's context
 * instead of waking the input thread.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* debug mode: recount the chain and panic on any mismatch */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued (possibly more than we just added) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/* threshold notification is done outside the lock */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4157 
4158 #if SKYWALK
4159 errno_t
ifnet_set_output_handler(struct ifnet * ifp,ifnet_output_func fn)4160 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4161 {
4162 	return atomic_test_set_ptr(&ifp->if_output,
4163 	           ptrauth_nop_cast(void *, ifp->if_save_output),
4164 	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
4165 }
4166 
4167 void
ifnet_reset_output_handler(struct ifnet * ifp)4168 ifnet_reset_output_handler(struct ifnet *ifp)
4169 {
4170 	while (!atomic_test_set_ptr(&ifp->if_output,
4171 	    ptrauth_nop_cast(void *, ifp->if_output),
4172 	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
4173 		;
4174 	}
4175 }
4176 
4177 errno_t
ifnet_set_start_handler(struct ifnet * ifp,ifnet_start_func fn)4178 ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4179 {
4180 	return atomic_test_set_ptr(&ifp->if_start,
4181 	           ptrauth_nop_cast(void *, ifp->if_save_start),
4182 	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
4183 }
4184 
4185 void
ifnet_reset_start_handler(struct ifnet * ifp)4186 ifnet_reset_start_handler(struct ifnet *ifp)
4187 {
4188 	while (!atomic_test_set_ptr(&ifp->if_start,
4189 	    ptrauth_nop_cast(void *, ifp->if_start),
4190 	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
4191 		;
4192 	}
4193 }
4194 #endif /* SKYWALK */
4195 
4196 static void
ifnet_start_common(struct ifnet * ifp,boolean_t resetfc)4197 ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
4198 {
4199 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
4200 		return;
4201 	}
4202 	/*
4203 	 * If the starter thread is inactive, signal it to do work,
4204 	 * unless the interface is being flow controlled from below,
4205 	 * e.g. a virtual interface being flow controlled by a real
4206 	 * network interface beneath it, or it's been disabled via
4207 	 * a call to ifnet_disable_output().
4208 	 */
4209 	lck_mtx_lock_spin(&ifp->if_start_lock);
4210 	if (resetfc) {
4211 		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
4212 	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
4213 		lck_mtx_unlock(&ifp->if_start_lock);
4214 		return;
4215 	}
4216 	ifp->if_start_req++;
4217 	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
4218 	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
4219 	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
4220 	    ifp->if_start_delayed == 0)) {
4221 		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
4222 	}
4223 	lck_mtx_unlock(&ifp->if_start_lock);
4224 }
4225 
/*
 * Public entry point to kick the starter thread for `ifp'; does not
 * clear any flow-control state (resetfc == FALSE.)
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE);
}
4231 
/*
 * Entry point of an interface's dedicated starter thread: set the
 * thread's name, optionally fold lo0's starter into the main input
 * thread's affinity set, then publish the embryonic state and block.
 * All subsequent work happens in ifnet_start_thread_cont().
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	/* continuation-style block: resumes in ifnet_start_thread_cont() */
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4297 
/*
 * Continuation of the starter thread: repeatedly invoke the driver's
 * if_start routine until no new requests have arrived (or the
 * interface is flow-controlled/terminating), then rearm either a
 * timed wait (TBR pacing or delayed start) or an indefinite wait and
 * block on itself as the continuation.  Terminates the thread when
 * IFSF_TERMINATING is set or the wait was interrupted.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: report ready and go back to sleep */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		/* snapshot the request count to detect new arrivals below */
		u_int32_t req = ifp->if_start_req;
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			/* defer the start to let a larger batch accumulate */
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* delayed start: wake up after the configured timeout */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "no timed wakeup" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4445 
4446 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4447 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4448 {
4449 	if (ts == NULL) {
4450 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4451 	} else {
4452 		*(&ifp->if_start_cycle) = *ts;
4453 	}
4454 
4455 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4456 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4457 		    if_name(ifp), ts->tv_nsec);
4458 	}
4459 }
4460 
/*
 * Post a poll request for the interface and, when the poller thread
 * exists and is not currently running its service loop, wake it up.
 * Caller must hold if_poll_lock.
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Bump the request counter; the poller compares this against the
	 * value it sampled at the top of its loop to detect new work.
	 */
	ifp->if_poll_req++;
	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
	    ifp->if_poll_thread != THREAD_NULL) {
		wakeup_one((caddr_t)&ifp->if_poll_thread);
	}
}
4472 
/*
 * Kick the interface's input poller thread.  Briefly takes if_poll_lock
 * (spin) so the request can be posted via ifnet_poll_wakeup().
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4483 
/*
 * Bootstrap entry point of the per-interface input poller thread
 * (started via kernel_thread_start).  Names the thread, marks it
 * embryonic, posts one wakeup so the continuation runs once to clear
 * the embryonic state, then blocks into ifnet_poll_thread_cont and
 * never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	/* only opportunistic-polling interfaces get a poller thread */
	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	/*
	 * The wait is asserted before the wakeup is posted, so the
	 * self-wakeup below is not lost.
	 */
	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4512 
/*
 * Continuation routine for the input poller thread; invoked (without a
 * stack context) each time the thread unblocks.  While requests are
 * pending it repeatedly calls the driver's if_input_poll routine to
 * harvest up to m_lim packets and hands them to ifnet_input_common();
 * it then either re-arms a timed/indefinite wait and blocks back into
 * this same continuation, or tears the thread down when terminating.
 *
 * Locking: if_poll_lock (spin) protects all if_poll_* state; it is
 * dropped across the driver poll and input calls.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	/* zeroed stats block, filled in and passed to ifnet_input_common() */
	bzero(&s, sizeof(s));
	/* cleared here but not otherwise used within this function */
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* bail out if interrupted or if the interface is going away */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after creation: clear the embryonic state, notify
	 * the creator (ifnet_decr_pending_thread_count), then go straight
	 * to re-arming the wait without servicing any request.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot the request counter to detect new requests later */
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet limit: explicit plim, or derived default */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* no packets harvested; still poke the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		/* block back into this continuation; does not return here */
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4679 
4680 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4681 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4682 {
4683 	if (ts == NULL) {
4684 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4685 	} else {
4686 		*(&ifp->if_poll_cycle) = *ts;
4687 	}
4688 
4689 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4690 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4691 		    if_name(ifp), ts->tv_nsec);
4692 	}
4693 }
4694 
4695 void
ifnet_purge(struct ifnet * ifp)4696 ifnet_purge(struct ifnet *ifp)
4697 {
4698 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4699 		if_qflush_snd(ifp, false);
4700 	}
4701 }
4702 
/*
 * Propagate a classq event (link state/bandwidth change etc.) to the
 * send queue.  Caller must hold the ifclassq lock.
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	/* nothing to do until the queue has been set up */
	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	/*
	 * If a token-bucket regulator is active, re-apply its current
	 * profile (raw rate/percent, depth recomputed) so the regulator
	 * parameters are refreshed before the scheduler sees the event.
	 */
	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	/* hand the event to the packet scheduler */
	ifclassq_update(ifq, ev);
}
4722 
4723 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4724 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4725 {
4726 	switch (ev) {
4727 	case CLASSQ_EV_LINK_BANDWIDTH:
4728 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4729 			ifp->if_poll_update++;
4730 		}
4731 		break;
4732 
4733 	default:
4734 		break;
4735 	}
4736 }
4737 
4738 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4739 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4740 {
4741 	struct ifclassq *ifq;
4742 	u_int32_t omodel;
4743 	errno_t err;
4744 
4745 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4746 		return EINVAL;
4747 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4748 		return ENXIO;
4749 	}
4750 
4751 	ifq = ifp->if_snd;
4752 	IFCQ_LOCK(ifq);
4753 	omodel = ifp->if_output_sched_model;
4754 	ifp->if_output_sched_model = model;
4755 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4756 		ifp->if_output_sched_model = omodel;
4757 	}
4758 	IFCQ_UNLOCK(ifq);
4759 
4760 	return err;
4761 }
4762 
4763 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4764 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4765 {
4766 	if (ifp == NULL) {
4767 		return EINVAL;
4768 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4769 		return ENXIO;
4770 	}
4771 
4772 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4773 
4774 	return 0;
4775 }
4776 
4777 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4778 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4779 {
4780 	if (ifp == NULL || maxqlen == NULL) {
4781 		return EINVAL;
4782 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4783 		return ENXIO;
4784 	}
4785 
4786 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4787 
4788 	return 0;
4789 }
4790 
4791 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4792 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4793 {
4794 	errno_t err;
4795 
4796 	if (ifp == NULL || pkts == NULL) {
4797 		err = EINVAL;
4798 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4799 		err = ENXIO;
4800 	} else {
4801 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4802 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4803 	}
4804 
4805 	return err;
4806 }
4807 
4808 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4809 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4810     u_int32_t *pkts, u_int32_t *bytes)
4811 {
4812 	errno_t err;
4813 
4814 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4815 	    (pkts == NULL && bytes == NULL)) {
4816 		err = EINVAL;
4817 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4818 		err = ENXIO;
4819 	} else {
4820 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4821 		    pkts, bytes);
4822 	}
4823 
4824 	return err;
4825 }
4826 
4827 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4828 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4829 {
4830 	struct dlil_threading_info *inp;
4831 
4832 	if (ifp == NULL) {
4833 		return EINVAL;
4834 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4835 		return ENXIO;
4836 	}
4837 
4838 	if (maxqlen == 0) {
4839 		maxqlen = if_rcvq_maxlen;
4840 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4841 		maxqlen = IF_RCVQ_MINLEN;
4842 	}
4843 
4844 	inp = ifp->if_inp;
4845 	lck_mtx_lock(&inp->dlth_lock);
4846 	qlimit(&inp->dlth_pkts) = maxqlen;
4847 	lck_mtx_unlock(&inp->dlth_lock);
4848 
4849 	return 0;
4850 }
4851 
4852 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4853 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4854 {
4855 	struct dlil_threading_info *inp;
4856 
4857 	if (ifp == NULL || maxqlen == NULL) {
4858 		return EINVAL;
4859 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4860 		return ENXIO;
4861 	}
4862 
4863 	inp = ifp->if_inp;
4864 	lck_mtx_lock(&inp->dlth_lock);
4865 	*maxqlen = qlimit(&inp->dlth_pkts);
4866 	lck_mtx_unlock(&inp->dlth_lock);
4867 	return 0;
4868 }
4869 
4870 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4871 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4872     uint16_t delay_timeout)
4873 {
4874 	if (delay_qlen > 0 && delay_timeout > 0) {
4875 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4876 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4877 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4878 		/* convert timeout to nanoseconds */
4879 		ifp->if_start_delay_timeout *= 1000;
4880 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4881 		    ifp->if_xname, (uint32_t)delay_qlen,
4882 		    (uint32_t)delay_timeout);
4883 	} else {
4884 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4885 	}
4886 }
4887 
4888 /*
4889  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4890  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4891  * buf holds the full header.
4892  */
/*
 * Clear the DSCP bits (keeping ECN) in the IPv4/IPv6 header at buf.
 * buf need not be 32-bit aligned — a local aligned bounce buffer is
 * used in that case — but the caller must ensure buf holds the full
 * header for the given ip_ver.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, used only when buf is misaligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: only ECN bits (or nothing) set — no change */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the header checksum for the TOS
		 * change, folding the end-around carry, rather than
		 * recomputing it over the whole header.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* copy the modified header back if it was bounced */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: DSCP field already clear — no change */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* IPv6 header has no checksum; just clear the bits */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* copy the modified header back if it was bounced */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4948 
/*
 * Core single-packet transmit enqueue path.  Timestamps the packet,
 * updates foreground/realtime activity markers (on the ifnet and, with
 * Skywalk, in the nexus advisory region), applies the Wi-Fi multicast
 * DSCP-clearing workaround, runs the delayed-start heuristics, enqueues
 * onto the classq (caller-supplied ifcq, or ifp->if_snd when ifcq is
 * NULL), and finally kicks the driver's start routine when appropriate.
 * The packet object is consumed in all cases; *pdrop reports whether
 * the classq dropped it.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;	/* foreground send timestamp */
	volatile uint64_t *rt_ts = NULL;	/* realtime send timestamp */
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;	/* IP header needing DSCP clearing */
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		/* stamp the packet if it does not carry a valid timestamp */
		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make the Ethernet header contiguous if needed */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP ethertype: workaround not needed */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* re-fetch; m_pullup may change the data ptr */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		/* stamp the packet if it does not carry a valid timestamp */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* skip the workaround if headers don't fit */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP ethertype: workaround not needed */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* apply the Wi-Fi multicast DSCP workaround, if armed above */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still within the sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate the heuristics */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5259 
5260 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5261 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5262     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5263     boolean_t flush, boolean_t *pdrop)
5264 {
5265 	int error;
5266 
5267 	/* enqueue the packet (caller consumes object) */
5268 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5269 	    cnt, bytes, pdrop);
5270 
5271 	/*
5272 	 * Tell the driver to start dequeueing; do this even when the queue
5273 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5274 	 * be dequeueing from other unsuspended queues.
5275 	 */
5276 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5277 		ifnet_start(ifp);
5278 	}
5279 	return error;
5280 }
5281 
5282 #if DEVELOPMENT || DEBUG
5283 void
trace_pkt_dump_payload(struct ifnet * ifp,struct __kern_packet * kpkt,bool input)5284 trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
5285 {
5286 #define MIN_TRACE_DUMP_PKT_SIZE  32
5287 	struct ether_header *eh = NULL;
5288 	struct udphdr *uh = NULL;
5289 
5290 	if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
5291 	    flow_key_trace.fk_ipver != IPV6_VERSION))) {
5292 		return;
5293 	}
5294 
5295 	uint16_t bdlim, bdlen, bdoff;
5296 	uint8_t *baddr;
5297 
5298 	MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);
5299 
5300 	if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
5301 		if (!IFNET_IS_ETHERNET(ifp)) {
5302 			return;
5303 		}
5304 
5305 		sa_family_t af = AF_UNSPEC;
5306 		ASSERT(kpkt->pkt_l2_len > 0);
5307 
5308 		baddr += kpkt->pkt_headroom;
5309 		eh = (struct ether_header *)(void *)baddr;
5310 		if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
5311 			return;
5312 		}
5313 		if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
5314 			return;
5315 		}
5316 		uint16_t ether_type = ntohs(eh->ether_type);
5317 		if (ether_type == ETHERTYPE_IP) {
5318 			af = AF_INET;
5319 		} else if (ether_type == ETHERTYPE_IPV6) {
5320 			af = AF_INET6;
5321 		} else {
5322 			return;
5323 		}
5324 		flow_pkt_classify(kpkt, ifp, af, input);
5325 	}
5326 
5327 	if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
5328 		return;
5329 	}
5330 
5331 	if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
5332 		return;
5333 	}
5334 
5335 	uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
5336 	uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;
5337 
5338 	if (kpkt->pkt_flow_udp_src != sport ||
5339 	    kpkt->pkt_flow_udp_dst != dport) {
5340 		return;
5341 	}
5342 
5343 	if (kpkt->pkt_flow_ip_ver == IPVERSION) {
5344 		struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
5345 		struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
5346 		struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;
5347 
5348 		if (ip_header->ip_src.s_addr != saddr->s_addr ||
5349 		    ip_header->ip_dst.s_addr != daddr->s_addr) {
5350 			return;
5351 		}
5352 	} else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
5353 		struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
5354 		struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
5355 		struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;
5356 
5357 		if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
5358 		    !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
5359 			return;
5360 		}
5361 	}
5362 
5363 	int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);
5364 
5365 	uint16_t pkt_payload_len = bdlim - bdoff;
5366 	pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);
5367 	pkt_payload_len -= udp_payload_offset;
5368 
5369 	if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
5370 		uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
5371 		uint8_t *payload = (uint8_t *)(uh + 1);
5372 
5373 		/* Trace 32 bytes of UDP transport payload */
5374 		uint64_t *trace1 = __DECONST(uint64_t *, payload);
5375 		uint64_t *trace2 = trace1 + 1;
5376 		uint64_t *trace3 = trace2 + 1;
5377 		uint64_t *trace4 = trace3 + 1;
5378 
5379 		if (input) {
5380 			KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5381 		} else {
5382 			KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5383 		}
5384 	}
5385 }
5386 #endif /* DEVELOPMENT || DEBUG */
5387 
5388 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5389 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5390 {
5391 	struct ifnet *ifp = handle;
5392 	boolean_t pdrop;        /* dummy */
5393 	uint32_t i;
5394 
5395 	ASSERT(n_pkts >= 1);
5396 	for (i = 0; i < n_pkts - 1; i++) {
5397 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5398 		    FALSE, &pdrop);
5399 	}
5400 	/* flush with the last packet */
5401 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5402 	    TRUE, &pdrop);
5403 
5404 	return 0;
5405 }
5406 
5407 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5408 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5409     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5410 {
5411 #if DEVELOPMENT || DEBUG
5412 	switch (pkt->cp_ptype) {
5413 	case QP_PACKET: {
5414 		trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
5415 		break;
5416 	}
5417 	case QP_MBUF:
5418 	case QP_INVALID: {
5419 		break;
5420 	}
5421 	}
5422 #endif /* DEVELOPMENT || DEBUG */
5423 
5424 	if (ifp->if_output_netem != NULL) {
5425 		bool drop;
5426 		errno_t error;
5427 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5428 		*pdrop = drop ? TRUE : FALSE;
5429 		return error;
5430 	} else {
5431 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5432 	}
5433 }
5434 
5435 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5436 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5437 {
5438 	boolean_t pdrop;
5439 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5440 }
5441 
5442 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5443 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5444     boolean_t *pdrop)
5445 {
5446 	classq_pkt_t pkt;
5447 
5448 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5449 	    m->m_nextpkt != NULL) {
5450 		if (m != NULL) {
5451 			m_freem_list(m);
5452 			*pdrop = TRUE;
5453 		}
5454 		return EINVAL;
5455 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5456 	    !IF_FULLY_ATTACHED(ifp)) {
5457 		/* flag tested without lock for performance */
5458 		m_freem(m);
5459 		*pdrop = TRUE;
5460 		return ENXIO;
5461 	} else if (!(ifp->if_flags & IFF_UP)) {
5462 		m_freem(m);
5463 		*pdrop = TRUE;
5464 		return ENETDOWN;
5465 	}
5466 
5467 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5468 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5469 }
5470 
5471 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5472 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5473     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5474     boolean_t *pdrop)
5475 {
5476 	classq_pkt_t head, tail;
5477 
5478 	ASSERT(m_head != NULL);
5479 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5480 	ASSERT(m_tail != NULL);
5481 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5482 	ASSERT(ifp != NULL);
5483 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5484 
5485 	if (!IF_FULLY_ATTACHED(ifp)) {
5486 		/* flag tested without lock for performance */
5487 		m_freem_list(m_head);
5488 		*pdrop = TRUE;
5489 		return ENXIO;
5490 	} else if (!(ifp->if_flags & IFF_UP)) {
5491 		m_freem_list(m_head);
5492 		*pdrop = TRUE;
5493 		return ENETDOWN;
5494 	}
5495 
5496 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5497 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5498 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5499 	           flush, pdrop);
5500 }
5501 
5502 #if SKYWALK
5503 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5504 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5505     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5506 {
5507 	classq_pkt_t pkt;
5508 
5509 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5510 
5511 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5512 		if (kpkt != NULL) {
5513 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5514 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5515 			*pdrop = TRUE;
5516 		}
5517 		return EINVAL;
5518 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5519 	    !IF_FULLY_ATTACHED(ifp))) {
5520 		/* flag tested without lock for performance */
5521 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5522 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5523 		*pdrop = TRUE;
5524 		return ENXIO;
5525 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5526 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5527 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5528 		*pdrop = TRUE;
5529 		return ENETDOWN;
5530 	}
5531 
5532 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5533 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5534 }
5535 
/*
 * Enqueue a single Skywalk packet with no explicit class queue
 * (ifcq == NULL); thin wrapper over ifnet_enqueue_pkt_common().
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5542 
/*
 * Enqueue a single Skywalk packet on a caller-supplied class queue;
 * thin wrapper over ifnet_enqueue_pkt_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5549 
5550 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5551 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5552     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5553     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5554 {
5555 	classq_pkt_t head, tail;
5556 
5557 	ASSERT(k_head != NULL);
5558 	ASSERT(k_tail != NULL);
5559 	ASSERT(ifp != NULL);
5560 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5561 
5562 	if (!IF_FULLY_ATTACHED(ifp)) {
5563 		/* flag tested without lock for performance */
5564 		pp_free_packet_chain(k_head, NULL);
5565 		*pdrop = TRUE;
5566 		return ENXIO;
5567 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5568 		pp_free_packet_chain(k_head, NULL);
5569 		*pdrop = TRUE;
5570 		return ENETDOWN;
5571 	}
5572 
5573 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5574 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5575 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5576 	           flush, pdrop);
5577 }
5578 
/*
 * Enqueue a Skywalk packet chain with no explicit class queue
 * (ifcq == NULL); thin wrapper over ifnet_enqueue_pkt_chain_common().
 */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5587 
/*
 * Enqueue a Skywalk packet chain on a caller-supplied class queue;
 * thin wrapper over ifnet_enqueue_pkt_chain_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5596 #endif /* SKYWALK */
5597 
5598 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5599 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5600 {
5601 	errno_t rc;
5602 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5603 
5604 	if (ifp == NULL || mp == NULL) {
5605 		return EINVAL;
5606 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5607 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5608 		return ENXIO;
5609 	}
5610 	if (!ifnet_is_attached(ifp, 1)) {
5611 		return ENXIO;
5612 	}
5613 
5614 #if SKYWALK
5615 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5616 #endif /* SKYWALK */
5617 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5618 	    &pkt, NULL, NULL, NULL, 0);
5619 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5620 	ifnet_decr_iorefcnt(ifp);
5621 	*mp = pkt.cp_mbuf;
5622 	return rc;
5623 }
5624 
/*
 * Dequeue a single mbuf of the given service class from ifp's send
 * queue.  Returns EINVAL for bad arguments or an invalid service
 * class, ENXIO if the interface is not TXSTART, has an unrecognized
 * scheduling model, or is not attached.
 */
errno_t
ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
    struct mbuf **mp)
{
	errno_t rc;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an I/O reference for the duration of the dequeue */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
	/* this path must only ever hand back mbufs (or nothing) */
	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*mp = pkt.cp_mbuf;
	return rc;
}
5652 
5653 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5654 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5655     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5656 {
5657 	errno_t rc;
5658 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5659 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5660 
5661 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5662 		return EINVAL;
5663 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5664 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5665 		return ENXIO;
5666 	}
5667 	if (!ifnet_is_attached(ifp, 1)) {
5668 		return ENXIO;
5669 	}
5670 
5671 #if SKYWALK
5672 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5673 #endif /* SKYWALK */
5674 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5675 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5676 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5677 	ifnet_decr_iorefcnt(ifp);
5678 	*head = pkt_head.cp_mbuf;
5679 	if (tail != NULL) {
5680 		*tail = pkt_tail.cp_mbuf;
5681 	}
5682 	return rc;
5683 }
5684 
/*
 * Dequeue mbufs from ifp's send queue up to a byte budget instead of a
 * packet count (packet limit is CLASSQ_DEQUEUE_MAX_PKT_LIMIT).  Outputs
 * mirror ifnet_dequeue_multi().
 */
errno_t
ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || byte_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an I/O reference for the duration of the dequeue */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
	/* this path must only ever hand back mbufs (or nothing) */
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5716 
/*
 * Dequeue up to pkt_limit mbufs of the given service class from ifp's
 * send queue.  Outputs mirror ifnet_dequeue_multi().
 */
errno_t
ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
    u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
	    !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* take an I/O reference for the duration of the dequeue */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
	    cnt, len, 0);
	/* this path must only ever hand back mbufs (or nothing) */
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5751 
5752 #if XNU_TARGET_OS_OSX
/*
 * Framer stub that zeroes the optional prepended/appended byte-count
 * out-parameters and defers the actual framing to the interface's
 * legacy framer callback (which takes no such parameters).
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
5767 #endif /* XNU_TARGET_OS_OSX */
5768 
5769 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5770 packet_has_vlan_tag(struct mbuf * m)
5771 {
5772 	u_int   tag = 0;
5773 
5774 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5775 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5776 		if (tag == 0) {
5777 			/* the packet is just priority-tagged, clear the bit */
5778 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5779 		}
5780 	}
5781 	return tag != 0;
5782 }
5783 
/*
 * Run an inbound packet through the interface's filter chain.
 *
 * Returns 0 to continue normal input processing, otherwise the first
 * filter's non-zero result; the caller frees the mbuf for any result
 * other than EJUSTRETURN.  A filter may replace the mbuf and/or frame
 * header through m_p / frame_header_p.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/* fast path: no filters attached, nothing to do */
	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* release the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5844 
/*
 * Run an outbound packet through the interface's filter chain.
 *
 * Returns 0 to continue output processing, otherwise the first
 * filter's non-zero result.  A filter may replace the mbuf through
 * m_p.  Mirrors dlil_interface_filters_input(), minus the frame
 * header and M_PROTO1 handling.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* release the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5894 
/*
 * Deliver a chain of received mbufs to one attached protocol.
 *
 * kProtoKPI_v1 input callbacks take one packet plus its frame header,
 * so the chain is unlinked and delivered one mbuf at a time;
 * kProtoKPI_v2 callbacks accept the whole m_nextpkt-linked list.  On
 * any callback error other than EJUSTRETURN the mbuf(s) are freed
 * here.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char *  frame_header;
			mbuf_t  next_packet;

			/* unlink this packet and hand over its frame header */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5926 
5927 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5928 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5929     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5930 {
5931 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5932 
5933 	if (s->packets_in != 0) {
5934 		d->packets_in += s->packets_in;
5935 	}
5936 	if (s->bytes_in != 0) {
5937 		d->bytes_in += s->bytes_in;
5938 	}
5939 	if (s->errors_in != 0) {
5940 		d->errors_in += s->errors_in;
5941 	}
5942 
5943 	if (s->packets_out != 0) {
5944 		d->packets_out += s->packets_out;
5945 	}
5946 	if (s->bytes_out != 0) {
5947 		d->bytes_out += s->bytes_out;
5948 	}
5949 	if (s->errors_out != 0) {
5950 		d->errors_out += s->errors_out;
5951 	}
5952 
5953 	if (s->collisions != 0) {
5954 		d->collisions += s->collisions;
5955 	}
5956 	if (s->dropped != 0) {
5957 		d->dropped += s->dropped;
5958 	}
5959 
5960 	if (poll) {
5961 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5962 	}
5963 }
5964 
5965 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)5966 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
5967 {
5968 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
5969 
5970 	/*
5971 	 * Use of atomic operations is unavoidable here because
5972 	 * these stats may also be incremented elsewhere via KPIs.
5973 	 */
5974 	if (s->packets_in != 0) {
5975 		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
5976 		s->packets_in = 0;
5977 	}
5978 	if (s->bytes_in != 0) {
5979 		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
5980 		s->bytes_in = 0;
5981 	}
5982 	if (s->errors_in != 0) {
5983 		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
5984 		s->errors_in = 0;
5985 	}
5986 
5987 	if (s->packets_out != 0) {
5988 		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
5989 		s->packets_out = 0;
5990 	}
5991 	if (s->bytes_out != 0) {
5992 		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
5993 		s->bytes_out = 0;
5994 	}
5995 	if (s->errors_out != 0) {
5996 		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
5997 		s->errors_out = 0;
5998 	}
5999 
6000 	if (s->collisions != 0) {
6001 		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
6002 		s->collisions = 0;
6003 	}
6004 	if (s->dropped != 0) {
6005 		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
6006 		s->dropped = 0;
6007 	}
6008 
6009 	/*
6010 	 * No need for atomic operations as they are modified here
6011 	 * only from within the DLIL input thread context.
6012 	 */
6013 	if (ifp->if_poll_tstats.packets != 0) {
6014 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
6015 		ifp->if_poll_tstats.packets = 0;
6016 	}
6017 	if (ifp->if_poll_tstats.bytes != 0) {
6018 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
6019 		ifp->if_poll_tstats.bytes = 0;
6020 	}
6021 
6022 	return ifp->if_data_threshold != 0;
6023 }
6024 
6025 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6026 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6027 {
6028 	return dlil_input_packet_list_common(ifp, m, 0,
6029 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6030 }
6031 
/*
 * Extended input-list entry point: the caller supplies the packet
 * count and the input polling model (ext == TRUE).
 */
__private_extern__ void
dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode)
{
	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
}
6038 
6039 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)6040 dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
6041     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
6042 {
6043 	int error = 0;
6044 	protocol_family_t protocol_family;
6045 	mbuf_t next_packet;
6046 	ifnet_t ifp = ifp_param;
6047 	char *frame_header = NULL;
6048 	struct if_proto *last_ifproto = NULL;
6049 	mbuf_t pkt_first = NULL;
6050 	mbuf_t *pkt_next = NULL;
6051 	u_int32_t poll_thresh = 0, poll_ival = 0;
6052 	int iorefcnt = 0;
6053 
6054 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6055 
6056 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
6057 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
6058 		poll_thresh = cnt;
6059 	}
6060 
6061 	while (m != NULL) {
6062 		struct if_proto *ifproto = NULL;
6063 		uint32_t pktf_mask;     /* pkt flags to preserve */
6064 
6065 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
6066 
6067 		if (ifp_param == NULL) {
6068 			ifp = m->m_pkthdr.rcvif;
6069 		}
6070 
6071 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
6072 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
6073 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
6074 			ifnet_poll(ifp);
6075 		}
6076 
6077 		/* Check if this mbuf looks valid */
6078 		MBUF_INPUT_CHECK(m, ifp);
6079 
6080 		next_packet = m->m_nextpkt;
6081 		m->m_nextpkt = NULL;
6082 		frame_header = m->m_pkthdr.pkt_hdr;
6083 		m->m_pkthdr.pkt_hdr = NULL;
6084 
6085 		/*
6086 		 * Get an IO reference count if the interface is not
6087 		 * loopback (lo0) and it is attached; lo0 never goes
6088 		 * away, so optimize for that.
6089 		 */
6090 		if (ifp != lo_ifp) {
6091 			/* iorefcnt is 0 if it hasn't been taken yet */
6092 			if (iorefcnt == 0) {
6093 				if (!ifnet_datamov_begin(ifp)) {
6094 					m_freem(m);
6095 					goto next;
6096 				}
6097 			}
6098 			iorefcnt = 1;
6099 			/*
6100 			 * Preserve the time stamp and skip pktap flags.
6101 			 */
6102 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
6103 		} else {
6104 			/*
6105 			 * If this arrived on lo0, preserve interface addr
6106 			 * info to allow for connectivity between loopback
6107 			 * and local interface addresses.
6108 			 */
6109 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
6110 		}
6111 		pktf_mask |= PKTF_WAKE_PKT;
6112 
6113 		/* make sure packet comes in clean */
6114 		m_classifier_init(m, pktf_mask);
6115 
6116 		ifp_inc_traffic_class_in(ifp, m);
6117 
6118 		/* find which protocol family this packet is for */
6119 		ifnet_lock_shared(ifp);
6120 		error = (*ifp->if_demux)(ifp, m, frame_header,
6121 		    &protocol_family);
6122 		ifnet_lock_done(ifp);
6123 		if (error != 0) {
6124 			if (error == EJUSTRETURN) {
6125 				goto next;
6126 			}
6127 			protocol_family = 0;
6128 		}
6129 
6130 #if (DEVELOPMENT || DEBUG)
6131 		/*
6132 		 * For testing we do not care about broadcast and multicast packets as
6133 		 * they are not as controllable as unicast traffic
6134 		 */
6135 		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
6136 			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
6137 			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
6138 				/*
6139 				 * This is a one-shot command
6140 				 */
6141 				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
6142 				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
6143 			}
6144 		}
6145 #endif /* (DEVELOPMENT || DEBUG) */
6146 		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
6147 			char buffer[64];
6148 			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
6149 
6150 			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
6151 			    ifp->if_xname, m_pktlen(m));
6152 			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
6153 				log_hexdump(buffer, buflen);
6154 			}
6155 		}
6156 
6157 		pktap_input(ifp, protocol_family, m, frame_header);
6158 
6159 		/* Drop v4 packets received on CLAT46 enabled cell interface */
6160 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6161 		    ifp->if_type == IFT_CELLULAR) {
6162 			m_freem(m);
6163 			ip6stat.ip6s_clat464_in_v4_drop++;
6164 			goto next;
6165 		}
6166 
6167 		/* Translate the packet if it is received on CLAT interface */
6168 		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
6169 		    && dlil_is_clat_needed(protocol_family, m)) {
6170 			char *data = NULL;
6171 			struct ether_header eh;
6172 			struct ether_header *ehp = NULL;
6173 
6174 			if (ifp->if_type == IFT_ETHER) {
6175 				ehp = (struct ether_header *)(void *)frame_header;
6176 				/* Skip RX Ethernet packets if they are not IPV6 */
6177 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
6178 					goto skip_clat;
6179 				}
6180 
6181 				/* Keep a copy of frame_header for Ethernet packets */
6182 				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
6183 			}
6184 			error = dlil_clat64(ifp, &protocol_family, &m);
6185 			data = (char *) mbuf_data(m);
6186 			if (error != 0) {
6187 				m_freem(m);
6188 				ip6stat.ip6s_clat464_in_drop++;
6189 				goto next;
6190 			}
6191 			/* Native v6 should be No-op */
6192 			if (protocol_family != PF_INET) {
6193 				goto skip_clat;
6194 			}
6195 
6196 			/* Do this only for translated v4 packets. */
6197 			switch (ifp->if_type) {
6198 			case IFT_CELLULAR:
6199 				frame_header = data;
6200 				break;
6201 			case IFT_ETHER:
6202 				/*
6203 				 * Drop if the mbuf doesn't have enough
6204 				 * space for Ethernet header
6205 				 */
6206 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
6207 					m_free(m);
6208 					ip6stat.ip6s_clat464_in_drop++;
6209 					goto next;
6210 				}
6211 				/*
6212 				 * Set the frame_header ETHER_HDR_LEN bytes
6213 				 * preceeding the data pointer. Change
6214 				 * the ether_type too.
6215 				 */
6216 				frame_header = data - ETHER_HDR_LEN;
6217 				eh.ether_type = htons(ETHERTYPE_IP);
6218 				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
6219 				break;
6220 			}
6221 		}
6222 skip_clat:
6223 		/*
6224 		 * Match the wake packet against the list of ports that has been
6225 		 * been queried by the driver before the device went to sleep
6226 		 */
6227 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
6228 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
6229 				if_ports_used_match_mbuf(ifp, protocol_family, m);
6230 			}
6231 		}
6232 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
6233 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
6234 			dlil_input_cksum_dbg(ifp, m, frame_header,
6235 			    protocol_family);
6236 		}
6237 		/*
6238 		 * For partial checksum offload, we expect the driver to
6239 		 * set the start offset indicating the start of the span
6240 		 * that is covered by the hardware-computed checksum;
6241 		 * adjust this start offset accordingly because the data
6242 		 * pointer has been advanced beyond the link-layer header.
6243 		 *
6244 		 * Virtual lan types (bridge, vlan, bond) can call
6245 		 * dlil_input_packet_list() with the same packet with the
6246 		 * checksum flags set. Set a flag indicating that the
6247 		 * adjustment has already been done.
6248 		 */
6249 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
6250 			/* adjustment has already been done */
6251 		} else if ((m->m_pkthdr.csum_flags &
6252 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6253 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6254 			int adj;
6255 			if (frame_header == NULL ||
6256 			    frame_header < (char *)mbuf_datastart(m) ||
6257 			    frame_header > (char *)m->m_data ||
6258 			    (adj = (int)(m->m_data - frame_header)) >
6259 			    m->m_pkthdr.csum_rx_start) {
6260 				m->m_pkthdr.csum_data = 0;
6261 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
6262 				hwcksum_in_invalidated++;
6263 			} else {
6264 				m->m_pkthdr.csum_rx_start -= adj;
6265 			}
6266 			/* make sure we don't adjust more than once */
6267 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
6268 		}
6269 		if (clat_debug) {
6270 			pktap_input(ifp, protocol_family, m, frame_header);
6271 		}
6272 
6273 		if (m->m_flags & (M_BCAST | M_MCAST)) {
6274 			atomic_add_64(&ifp->if_imcasts, 1);
6275 		}
6276 
6277 		/* run interface filters */
6278 		error = dlil_interface_filters_input(ifp, &m,
6279 		    &frame_header, protocol_family);
6280 		if (error != 0) {
6281 			if (error != EJUSTRETURN) {
6282 				m_freem(m);
6283 			}
6284 			goto next;
6285 		}
6286 		/*
6287 		 * A VLAN interface receives VLAN-tagged packets by attaching
6288 		 * its PF_VLAN protocol to a parent interface. When a VLAN
6289 		 * interface is a member of a bridge, the parent interface
6290 		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
6291 		 * M_PROMISC packet must be processed by the VLAN protocol
6292 		 * so that it can be sent up the stack via
6293 		 * dlil_input_packet_list(). That allows the bridge interface's
6294 		 * input filter, attached to the VLAN interface, to process
6295 		 * the packet.
6296 		 */
6297 		if (protocol_family != PF_VLAN &&
6298 		    (m->m_flags & M_PROMISC) != 0) {
6299 			m_freem(m);
6300 			goto next;
6301 		}
6302 
6303 		/* Lookup the protocol attachment to this interface */
6304 		if (protocol_family == 0) {
6305 			ifproto = NULL;
6306 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
6307 		    (last_ifproto->protocol_family == protocol_family)) {
6308 			VERIFY(ifproto == NULL);
6309 			ifproto = last_ifproto;
6310 			if_proto_ref(last_ifproto);
6311 		} else {
6312 			VERIFY(ifproto == NULL);
6313 			ifnet_lock_shared(ifp);
6314 			/* callee holds a proto refcnt upon success */
6315 			ifproto = find_attached_proto(ifp, protocol_family);
6316 			ifnet_lock_done(ifp);
6317 		}
6318 		if (ifproto == NULL) {
6319 			/* no protocol for this packet, discard */
6320 			m_freem(m);
6321 			goto next;
6322 		}
6323 		if (ifproto != last_ifproto) {
6324 			if (last_ifproto != NULL) {
6325 				/* pass up the list for the previous protocol */
6326 				dlil_ifproto_input(last_ifproto, pkt_first);
6327 				pkt_first = NULL;
6328 				if_proto_free(last_ifproto);
6329 			}
6330 			last_ifproto = ifproto;
6331 			if_proto_ref(ifproto);
6332 		}
6333 		/* extend the list */
6334 		m->m_pkthdr.pkt_hdr = frame_header;
6335 		if (pkt_first == NULL) {
6336 			pkt_first = m;
6337 		} else {
6338 			*pkt_next = m;
6339 		}
6340 		pkt_next = &m->m_nextpkt;
6341 
6342 next:
6343 		if (next_packet == NULL && last_ifproto != NULL) {
6344 			/* pass up the last list of packets */
6345 			dlil_ifproto_input(last_ifproto, pkt_first);
6346 			if_proto_free(last_ifproto);
6347 			last_ifproto = NULL;
6348 		}
6349 		if (ifproto != NULL) {
6350 			if_proto_free(ifproto);
6351 			ifproto = NULL;
6352 		}
6353 
6354 		m = next_packet;
6355 
6356 		/* update the driver's multicast filter, if needed */
6357 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6358 			ifp->if_updatemcasts = 0;
6359 		}
6360 		if (iorefcnt == 1) {
6361 			/* If the next mbuf is on a different interface, unlock data-mov */
6362 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
6363 				ifnet_datamov_end(ifp);
6364 				iorefcnt = 0;
6365 			}
6366 		}
6367 	}
6368 
6369 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6370 }
6371 
6372 errno_t
if_mcasts_update(struct ifnet * ifp)6373 if_mcasts_update(struct ifnet *ifp)
6374 {
6375 	errno_t err;
6376 
6377 	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6378 	if (err == EAFNOSUPPORT) {
6379 		err = 0;
6380 	}
6381 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6382 	    "(err=%d)\n", if_name(ifp),
6383 	    (err == 0 ? "successfully restored" : "failed to restore"),
6384 	    ifp->if_updatemcasts, err);
6385 
6386 	/* just return success */
6387 	return 0;
6388 }
6389 
6390 /* If ifp is set, we will increment the generation for the interface */
6391 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6392 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6393 {
6394 	if (ifp != NULL) {
6395 		ifnet_increment_generation(ifp);
6396 	}
6397 
6398 #if NECP
6399 	necp_update_all_clients();
6400 #endif /* NECP */
6401 
6402 	return kev_post_msg(event);
6403 }
6404 
6405 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6406 dlil_post_sifflags_msg(struct ifnet * ifp)
6407 {
6408 	struct kev_msg ev_msg;
6409 	struct net_event_data ev_data;
6410 
6411 	bzero(&ev_data, sizeof(ev_data));
6412 	bzero(&ev_msg, sizeof(ev_msg));
6413 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6414 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6415 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6416 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6417 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6418 	ev_data.if_family = ifp->if_family;
6419 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6420 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6421 	ev_msg.dv[0].data_ptr = &ev_data;
6422 	ev_msg.dv[1].data_length = 0;
6423 	dlil_post_complete_msg(ifp, &ev_msg);
6424 }
6425 
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver an event to the interface filters, then to every protocol
 * attached to the interface, then to the interface itself, and finally
 * post it as a kernel event message via dlil_post_complete_msg().
 *
 * The protocol list is snapshotted into a refcounted array (stack array
 * for up to TMP_IF_PROTO_ARR_SIZE protocols, heap otherwise) so that
 * protocol event callbacks can run without holding the ifnet lock.
 *
 * Returns the result of kev_post_msg() for the event.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the callback; the busy
			 * monitor keeps the filter list stable */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many protocols for the stack array */
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* take a refcount on each proto so the snapshot stays
		 * valid after the ifnet lock is dropped */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event callback, lock-free */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6526 
6527 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6528 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6529 {
6530 	struct kev_msg kev_msg;
6531 	int result = 0;
6532 
6533 	if (ifp == NULL || event == NULL) {
6534 		return EINVAL;
6535 	}
6536 
6537 	bzero(&kev_msg, sizeof(kev_msg));
6538 	kev_msg.vendor_code = event->vendor_code;
6539 	kev_msg.kev_class = event->kev_class;
6540 	kev_msg.kev_subclass = event->kev_subclass;
6541 	kev_msg.event_code = event->event_code;
6542 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6543 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6544 	kev_msg.dv[1].data_length = 0;
6545 
6546 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6547 
6548 	return result;
6549 }
6550 
6551 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6552 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6553 {
6554 	mbuf_t  n = m;
6555 	int chainlen = 0;
6556 
6557 	while (n != NULL) {
6558 		chainlen++;
6559 		n = n->m_next;
6560 	}
6561 	switch (chainlen) {
6562 	case 0:
6563 		break;
6564 	case 1:
6565 		atomic_add_64(&cls->cls_one, 1);
6566 		break;
6567 	case 2:
6568 		atomic_add_64(&cls->cls_two, 1);
6569 		break;
6570 	case 3:
6571 		atomic_add_64(&cls->cls_three, 1);
6572 		break;
6573 	case 4:
6574 		atomic_add_64(&cls->cls_four, 1);
6575 		break;
6576 	case 5:
6577 	default:
6578 		atomic_add_64(&cls->cls_five_or_more, 1);
6579 		break;
6580 	}
6581 }
6582 
6583 #if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * Kept out-of-line (noinline) so the probe site stays stable and the
 * fast path is not bloated when DTrace is compiled in.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		/* v4 packet: pass the ip header, NULL for the ip6 slot */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		/* v6 packet: pass the ip6 header, NULL for the ip slot */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
6600 #endif /* CONFIG_DTRACE */
6601 
6602 /*
6603  * dlil_output
6604  *
6605  * Caller should have a lock on the protocol domain if the protocol
6606  * doesn't support finer grained locking. In most cases, the lock
6607  * will be held from the socket layer and won't be released until
6608  * we return back to the socket layer.
6609  *
6610  * This does mean that we must take a protocol lock before we take
6611  * an interface lock if we're going to take both. This makes sense
6612  * because a protocol is likely to interact with an ifp while it
6613  * is under the protocol lock.
6614  *
6615  * An advisory code will be returned if adv is not null. This
6616  * can be used to provide feedback about interface queues to the
6617  * application.
6618  */
6619 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6620 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6621     void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6622 {
6623 	char *frame_type = NULL;
6624 	char *dst_linkaddr = NULL;
6625 	int retval = 0;
6626 	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6627 	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6628 	struct if_proto *proto = NULL;
6629 	mbuf_t  m = NULL;
6630 	mbuf_t  send_head = NULL;
6631 	mbuf_t  *send_tail = &send_head;
6632 	int iorefcnt = 0;
6633 	u_int32_t pre = 0, post = 0;
6634 	u_int32_t fpkts = 0, fbytes = 0;
6635 	int32_t flen = 0;
6636 	struct timespec now;
6637 	u_int64_t now_nsec;
6638 	boolean_t did_clat46 = FALSE;
6639 	protocol_family_t old_proto_family = proto_family;
6640 	struct sockaddr_in6 dest6;
6641 	struct rtentry *rt = NULL;
6642 	u_int32_t m_loop_set = 0;
6643 
6644 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6645 
6646 	/*
6647 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6648 	 * from happening while this operation is in progress
6649 	 */
6650 	if (!ifnet_datamov_begin(ifp)) {
6651 		retval = ENXIO;
6652 		goto cleanup;
6653 	}
6654 	iorefcnt = 1;
6655 
6656 	VERIFY(ifp->if_output_dlil != NULL);
6657 
6658 	/* update the driver's multicast filter, if needed */
6659 	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6660 		ifp->if_updatemcasts = 0;
6661 	}
6662 
6663 	frame_type = frame_type_buffer;
6664 	dst_linkaddr = dst_linkaddr_buffer;
6665 
6666 	if (raw == 0) {
6667 		ifnet_lock_shared(ifp);
6668 		/* callee holds a proto refcnt upon success */
6669 		proto = find_attached_proto(ifp, proto_family);
6670 		if (proto == NULL) {
6671 			ifnet_lock_done(ifp);
6672 			retval = ENXIO;
6673 			goto cleanup;
6674 		}
6675 		ifnet_lock_done(ifp);
6676 	}
6677 
6678 preout_again:
6679 	if (packetlist == NULL) {
6680 		goto cleanup;
6681 	}
6682 
6683 	m = packetlist;
6684 	packetlist = packetlist->m_nextpkt;
6685 	m->m_nextpkt = NULL;
6686 
6687 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6688 
6689 	/*
6690 	 * Perform address family translation for the first
6691 	 * packet outside the loop in order to perform address
6692 	 * lookup for the translated proto family.
6693 	 */
6694 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6695 	    (ifp->if_type == IFT_CELLULAR ||
6696 	    dlil_is_clat_needed(proto_family, m))) {
6697 		retval = dlil_clat46(ifp, &proto_family, &m);
6698 		/*
6699 		 * Go to the next packet if translation fails
6700 		 */
6701 		if (retval != 0) {
6702 			m_freem(m);
6703 			m = NULL;
6704 			ip6stat.ip6s_clat464_out_drop++;
6705 			/* Make sure that the proto family is PF_INET */
6706 			ASSERT(proto_family == PF_INET);
6707 			goto preout_again;
6708 		}
6709 		/*
6710 		 * Free the old one and make it point to the IPv6 proto structure.
6711 		 *
6712 		 * Change proto for the first time we have successfully
6713 		 * performed address family translation.
6714 		 */
6715 		if (!did_clat46 && proto_family == PF_INET6) {
6716 			did_clat46 = TRUE;
6717 
6718 			if (proto != NULL) {
6719 				if_proto_free(proto);
6720 			}
6721 			ifnet_lock_shared(ifp);
6722 			/* callee holds a proto refcnt upon success */
6723 			proto = find_attached_proto(ifp, proto_family);
6724 			if (proto == NULL) {
6725 				ifnet_lock_done(ifp);
6726 				retval = ENXIO;
6727 				m_freem(m);
6728 				m = NULL;
6729 				goto cleanup;
6730 			}
6731 			ifnet_lock_done(ifp);
6732 			if (ifp->if_type == IFT_ETHER) {
6733 				/* Update the dest to translated v6 address */
6734 				dest6.sin6_len = sizeof(struct sockaddr_in6);
6735 				dest6.sin6_family = AF_INET6;
6736 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6737 				dest = (const struct sockaddr *)&dest6;
6738 
6739 				/*
6740 				 * Lookup route to the translated destination
6741 				 * Free this route ref during cleanup
6742 				 */
6743 				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
6744 				    0, 0, ifp->if_index);
6745 
6746 				route = rt;
6747 			}
6748 		}
6749 	}
6750 
6751 	/*
6752 	 * This path gets packet chain going to the same destination.
6753 	 * The pre output routine is used to either trigger resolution of
6754 	 * the next hop or retreive the next hop's link layer addressing.
6755 	 * For ex: ether_inet(6)_pre_output routine.
6756 	 *
6757 	 * If the routine returns EJUSTRETURN, it implies that packet has
6758 	 * been queued, and therefore we have to call preout_again for the
6759 	 * following packet in the chain.
6760 	 *
6761 	 * For errors other than EJUSTRETURN, the current packet is freed
6762 	 * and the rest of the chain (pointed by packetlist is freed as
6763 	 * part of clean up.
6764 	 *
6765 	 * Else if there is no error the retrieved information is used for
6766 	 * all the packets in the chain.
6767 	 */
6768 	if (raw == 0) {
6769 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6770 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6771 		retval = 0;
6772 		if (preoutp != NULL) {
6773 			retval = preoutp(ifp, proto_family, &m, dest, route,
6774 			    frame_type, dst_linkaddr);
6775 
6776 			if (retval != 0) {
6777 				if (retval == EJUSTRETURN) {
6778 					goto preout_again;
6779 				}
6780 				m_freem(m);
6781 				m = NULL;
6782 				goto cleanup;
6783 			}
6784 		}
6785 	}
6786 
6787 	do {
6788 		/*
6789 		 * pkt_hdr is set here to point to m_data prior to
6790 		 * calling into the framer. This value of pkt_hdr is
6791 		 * used by the netif gso logic to retrieve the ip header
6792 		 * for the TCP packets, offloaded for TSO processing.
6793 		 */
6794 		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6795 			uint8_t vlan_encap_len = 0;
6796 
6797 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6798 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6799 			}
6800 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6801 		} else {
6802 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
6803 		}
6804 
6805 		/*
6806 		 * Perform address family translation if needed.
6807 		 * For now we only support stateless 4 to 6 translation
6808 		 * on the out path.
6809 		 *
6810 		 * The routine below translates IP header, updates protocol
6811 		 * checksum and also translates ICMP.
6812 		 *
6813 		 * We skip the first packet as it is already translated and
6814 		 * the proto family is set to PF_INET6.
6815 		 */
6816 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6817 		    (ifp->if_type == IFT_CELLULAR ||
6818 		    dlil_is_clat_needed(proto_family, m))) {
6819 			retval = dlil_clat46(ifp, &proto_family, &m);
6820 			/* Goto the next packet if the translation fails */
6821 			if (retval != 0) {
6822 				m_freem(m);
6823 				m = NULL;
6824 				ip6stat.ip6s_clat464_out_drop++;
6825 				goto next;
6826 			}
6827 		}
6828 
6829 #if CONFIG_DTRACE
6830 		if (!raw) {
6831 			dlil_output_dtrace(ifp, proto_family, m);
6832 		}
6833 #endif /* CONFIG_DTRACE */
6834 
6835 		if (raw == 0 && ifp->if_framer != NULL) {
6836 			int rcvif_set = 0;
6837 
6838 			/*
6839 			 * If this is a broadcast packet that needs to be
6840 			 * looped back into the system, set the inbound ifp
6841 			 * to that of the outbound ifp.  This will allow
6842 			 * us to determine that it is a legitimate packet
6843 			 * for the system.  Only set the ifp if it's not
6844 			 * already set, just to be safe.
6845 			 */
6846 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6847 			    m->m_pkthdr.rcvif == NULL) {
6848 				m->m_pkthdr.rcvif = ifp;
6849 				rcvif_set = 1;
6850 			}
6851 			m_loop_set = m->m_flags & M_LOOP;
6852 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6853 			    frame_type, &pre, &post);
6854 			if (retval != 0) {
6855 				if (retval != EJUSTRETURN) {
6856 					m_freem(m);
6857 				}
6858 				goto next;
6859 			}
6860 
6861 			/*
6862 			 * For partial checksum offload, adjust the start
6863 			 * and stuff offsets based on the prepended header.
6864 			 */
6865 			if ((m->m_pkthdr.csum_flags &
6866 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6867 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6868 				m->m_pkthdr.csum_tx_stuff += pre;
6869 				m->m_pkthdr.csum_tx_start += pre;
6870 			}
6871 
6872 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6873 				dlil_output_cksum_dbg(ifp, m, pre,
6874 				    proto_family);
6875 			}
6876 
6877 			/*
6878 			 * Clear the ifp if it was set above, and to be
6879 			 * safe, only if it is still the same as the
6880 			 * outbound ifp we have in context.  If it was
6881 			 * looped back, then a copy of it was sent to the
6882 			 * loopback interface with the rcvif set, and we
6883 			 * are clearing the one that will go down to the
6884 			 * layer below.
6885 			 */
6886 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6887 				m->m_pkthdr.rcvif = NULL;
6888 			}
6889 		}
6890 
6891 		/*
6892 		 * Let interface filters (if any) do their thing ...
6893 		 */
6894 		retval = dlil_interface_filters_output(ifp, &m, proto_family);
6895 		if (retval != 0) {
6896 			if (retval != EJUSTRETURN) {
6897 				m_freem(m);
6898 			}
6899 			goto next;
6900 		}
6901 		/*
6902 		 * Strip away M_PROTO1 bit prior to sending packet
6903 		 * to the driver as this field may be used by the driver
6904 		 */
6905 		m->m_flags &= ~M_PROTO1;
6906 
6907 		/*
6908 		 * If the underlying interface is not capable of handling a
6909 		 * packet whose data portion spans across physically disjoint
6910 		 * pages, we need to "normalize" the packet so that we pass
6911 		 * down a chain of mbufs where each mbuf points to a span that
6912 		 * resides in the system page boundary.  If the packet does
6913 		 * not cross page(s), the following is a no-op.
6914 		 */
6915 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6916 			if ((m = m_normalize(m)) == NULL) {
6917 				goto next;
6918 			}
6919 		}
6920 
6921 		/*
6922 		 * If this is a TSO packet, make sure the interface still
6923 		 * advertise TSO capability.
6924 		 */
6925 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6926 			retval = EMSGSIZE;
6927 			m_freem(m);
6928 			goto cleanup;
6929 		}
6930 
6931 		ifp_inc_traffic_class_out(ifp, m);
6932 
6933 #if SKYWALK
6934 		/*
6935 		 * For native skywalk devices, packets will be passed to pktap
6936 		 * after GSO or after the mbuf to packet conversion.
6937 		 * This is done for IPv4/IPv6 packets only because there is no
6938 		 * space in the mbuf to pass down the proto family.
6939 		 */
6940 		if (dlil_is_native_netif_nexus(ifp)) {
6941 			if (raw || m->m_pkthdr.pkt_proto == 0) {
6942 				pktap_output(ifp, proto_family, m, pre, post);
6943 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6944 			}
6945 		} else {
6946 			pktap_output(ifp, proto_family, m, pre, post);
6947 		}
6948 #else /* SKYWALK */
6949 		pktap_output(ifp, proto_family, m, pre, post);
6950 #endif /* SKYWALK */
6951 
6952 		/*
6953 		 * Count the number of elements in the mbuf chain
6954 		 */
6955 		if (tx_chain_len_count) {
6956 			dlil_count_chain_len(m, &tx_chain_len_stats);
6957 		}
6958 
6959 		/*
6960 		 * Record timestamp; ifnet_enqueue() will use this info
6961 		 * rather than redoing the work.  An optimization could
6962 		 * involve doing this just once at the top, if there are
6963 		 * no interface filters attached, but that's probably
6964 		 * not a big deal.
6965 		 */
6966 		nanouptime(&now);
6967 		net_timernsec(&now, &now_nsec);
6968 		(void) mbuf_set_timestamp(m, now_nsec, TRUE);
6969 
6970 		/*
6971 		 * Discard partial sum information if this packet originated
6972 		 * from another interface; the packet would already have the
6973 		 * final checksum and we shouldn't recompute it.
6974 		 */
6975 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6976 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6977 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6978 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6979 			m->m_pkthdr.csum_data = 0;
6980 		}
6981 
6982 		/*
6983 		 * Finally, call the driver.
6984 		 */
6985 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6986 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6987 				flen += (m_pktlen(m) - (pre + post));
6988 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6989 			}
6990 			*send_tail = m;
6991 			send_tail = &m->m_nextpkt;
6992 		} else {
6993 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6994 				flen = (m_pktlen(m) - (pre + post));
6995 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6996 			} else {
6997 				flen = 0;
6998 			}
6999 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7000 			    0, 0, 0, 0, 0);
7001 			retval = (*ifp->if_output_dlil)(ifp, m);
7002 			if (retval == EQFULL || retval == EQSUSPENDED) {
7003 				if (adv != NULL && adv->code == FADV_SUCCESS) {
7004 					adv->code = (retval == EQFULL ?
7005 					    FADV_FLOW_CONTROLLED :
7006 					    FADV_SUSPENDED);
7007 				}
7008 				retval = 0;
7009 			}
7010 			if (retval == 0 && flen > 0) {
7011 				fbytes += flen;
7012 				fpkts++;
7013 			}
7014 			if (retval != 0 && dlil_verbose) {
7015 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7016 				    __func__, if_name(ifp),
7017 				    retval);
7018 			}
7019 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7020 			    0, 0, 0, 0, 0);
7021 		}
7022 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7023 
7024 next:
7025 		m = packetlist;
7026 		if (m != NULL) {
7027 			m->m_flags |= m_loop_set;
7028 			packetlist = packetlist->m_nextpkt;
7029 			m->m_nextpkt = NULL;
7030 		}
7031 		/* Reset the proto family to old proto family for CLAT */
7032 		if (did_clat46) {
7033 			proto_family = old_proto_family;
7034 		}
7035 	} while (m != NULL);
7036 
7037 	if (send_head != NULL) {
7038 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7039 		    0, 0, 0, 0, 0);
7040 		if (ifp->if_eflags & IFEF_SENDLIST) {
7041 			retval = (*ifp->if_output_dlil)(ifp, send_head);
7042 			if (retval == EQFULL || retval == EQSUSPENDED) {
7043 				if (adv != NULL) {
7044 					adv->code = (retval == EQFULL ?
7045 					    FADV_FLOW_CONTROLLED :
7046 					    FADV_SUSPENDED);
7047 				}
7048 				retval = 0;
7049 			}
7050 			if (retval == 0 && flen > 0) {
7051 				fbytes += flen;
7052 				fpkts++;
7053 			}
7054 			if (retval != 0 && dlil_verbose) {
7055 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7056 				    __func__, if_name(ifp), retval);
7057 			}
7058 		} else {
7059 			struct mbuf *send_m;
7060 			int enq_cnt = 0;
7061 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7062 			while (send_head != NULL) {
7063 				send_m = send_head;
7064 				send_head = send_m->m_nextpkt;
7065 				send_m->m_nextpkt = NULL;
7066 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7067 				if (retval == EQFULL || retval == EQSUSPENDED) {
7068 					if (adv != NULL) {
7069 						adv->code = (retval == EQFULL ?
7070 						    FADV_FLOW_CONTROLLED :
7071 						    FADV_SUSPENDED);
7072 					}
7073 					retval = 0;
7074 				}
7075 				if (retval == 0) {
7076 					enq_cnt++;
7077 					if (flen > 0) {
7078 						fpkts++;
7079 					}
7080 				}
7081 				if (retval != 0 && dlil_verbose) {
7082 					DLIL_PRINTF("%s: output error on %s "
7083 					    "retval = %d\n",
7084 					    __func__, if_name(ifp), retval);
7085 				}
7086 			}
7087 			if (enq_cnt > 0) {
7088 				fbytes += flen;
7089 				ifnet_start(ifp);
7090 			}
7091 		}
7092 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7093 	}
7094 
7095 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7096 
7097 cleanup:
7098 	if (fbytes > 0) {
7099 		ifp->if_fbytes += fbytes;
7100 	}
7101 	if (fpkts > 0) {
7102 		ifp->if_fpackets += fpkts;
7103 	}
7104 	if (proto != NULL) {
7105 		if_proto_free(proto);
7106 	}
7107 	if (packetlist) { /* if any packets are left, clean up */
7108 		mbuf_freem_list(packetlist);
7109 	}
7110 	if (retval == EJUSTRETURN) {
7111 		retval = 0;
7112 	}
7113 	if (iorefcnt == 1) {
7114 		ifnet_datamov_end(ifp);
7115 	}
7116 	if (rt != NULL) {
7117 		rtfree(rt);
7118 		rt = NULL;
7119 	}
7120 
7121 	return retval;
7122 }
7123 
7124 /*
7125  * This routine checks if the destination address is not a loopback, link-local,
7126  * multicast or broadcast address.
7127  */
7128 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7129 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7130 {
7131 	int ret = 0;
7132 	switch (proto_family) {
7133 	case PF_INET: {
7134 		struct ip *iph = mtod(m, struct ip *);
7135 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7136 			ret = 1;
7137 		}
7138 		break;
7139 	}
7140 	case PF_INET6: {
7141 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7142 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7143 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7144 			ret = 1;
7145 		}
7146 		break;
7147 	}
7148 	}
7149 
7150 	return ret;
7151 }
7152 /*
7153  * @brief This routine translates IPv4 packet to IPv6 packet,
7154  *     updates protocol checksum and also translates ICMP for code
7155  *     along with inner header translation.
7156  *
7157  * @param ifp Pointer to the interface
7158  * @param proto_family pointer to protocol family. It is updated if function
7159  *     performs the translation successfully.
7160  * @param m Pointer to the pointer pointing to the packet. Needed because this
7161  *     routine can end up changing the mbuf to a different one.
7162  *
7163  * @return 0 on success or else a negative value.
7164  */
7165 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7166 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7167 {
7168 	VERIFY(*proto_family == PF_INET);
7169 	VERIFY(IS_INTF_CLAT46(ifp));
7170 
7171 	pbuf_t pbuf_store, *pbuf = NULL;
7172 	struct ip *iph = NULL;
7173 	struct in_addr osrc, odst;
7174 	uint8_t proto = 0;
7175 	struct in6_ifaddr *ia6_clat_src = NULL;
7176 	struct in6_addr *src = NULL;
7177 	struct in6_addr dst;
7178 	int error = 0;
7179 	uint16_t off = 0;
7180 	uint16_t tot_len = 0;
7181 	uint16_t ip_id_val = 0;
7182 	uint16_t ip_frag_off = 0;
7183 
7184 	boolean_t is_frag = FALSE;
7185 	boolean_t is_first_frag = TRUE;
7186 	boolean_t is_last_frag = TRUE;
7187 
7188 	pbuf_init_mbuf(&pbuf_store, *m, ifp);
7189 	pbuf = &pbuf_store;
7190 	iph = pbuf->pb_data;
7191 
7192 	osrc = iph->ip_src;
7193 	odst = iph->ip_dst;
7194 	proto = iph->ip_p;
7195 	off = (uint16_t)(iph->ip_hl << 2);
7196 	ip_id_val = iph->ip_id;
7197 	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7198 
7199 	tot_len = ntohs(iph->ip_len);
7200 
7201 	/*
7202 	 * For packets that are not first frags
7203 	 * we only need to adjust CSUM.
7204 	 * For 4 to 6, Fragmentation header gets appended
7205 	 * after proto translation.
7206 	 */
7207 	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7208 		is_frag = TRUE;
7209 
7210 		/* If the offset is not zero, it is not first frag */
7211 		if (ip_frag_off != 0) {
7212 			is_first_frag = FALSE;
7213 		}
7214 
7215 		/* If IP_MF is set, then it is not last frag */
7216 		if (ntohs(iph->ip_off) & IP_MF) {
7217 			is_last_frag = FALSE;
7218 		}
7219 	}
7220 
7221 	/*
7222 	 * Retrive the local IPv6 CLAT46 address reserved for stateless
7223 	 * translation.
7224 	 */
7225 	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7226 	if (ia6_clat_src == NULL) {
7227 		ip6stat.ip6s_clat464_out_nov6addr_drop++;
7228 		error = -1;
7229 		goto cleanup;
7230 	}
7231 
7232 	src = &ia6_clat_src->ia_addr.sin6_addr;
7233 
7234 	/*
7235 	 * Translate IPv4 destination to IPv6 destination by using the
7236 	 * prefixes learned through prior PLAT discovery.
7237 	 */
7238 	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
7239 		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7240 		goto cleanup;
7241 	}
7242 
7243 	/* Translate the IP header part first */
7244 	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7245 	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
7246 
7247 	iph = NULL;     /* Invalidate iph as pbuf has been modified */
7248 
7249 	if (error != 0) {
7250 		ip6stat.ip6s_clat464_out_46transfail_drop++;
7251 		goto cleanup;
7252 	}
7253 
7254 	/*
7255 	 * Translate protocol header, update checksum, checksum flags
7256 	 * and related fields.
7257 	 */
7258 	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7259 	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7260 
7261 	if (error != 0) {
7262 		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7263 		goto cleanup;
7264 	}
7265 
7266 	/* Now insert the IPv6 fragment header */
7267 	if (is_frag) {
7268 		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7269 
7270 		if (error != 0) {
7271 			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7272 			goto cleanup;
7273 		}
7274 	}
7275 
7276 cleanup:
7277 	if (ia6_clat_src != NULL) {
7278 		IFA_REMREF(&ia6_clat_src->ia_ifa);
7279 	}
7280 
7281 	if (pbuf_is_valid(pbuf)) {
7282 		*m = pbuf->pb_mbuf;
7283 		pbuf->pb_mbuf = NULL;
7284 		pbuf_destroy(pbuf);
7285 	} else {
7286 		error = -1;
7287 		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7288 	}
7289 
7290 	if (error == 0) {
7291 		*proto_family = PF_INET6;
7292 		ip6stat.ip6s_clat464_out_success++;
7293 	}
7294 
7295 	return error;
7296 }
7297 
7298 /*
7299  * @brief This routine translates incoming IPv6 to IPv4 packet,
7300  *     updates protocol checksum and also translates ICMPv6 outer
7301  *     and inner headers
7302  *
7303  * @return 0 on success or else a negative value.
7304  */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;             /* original IPv6 src/dst */
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL; /* local CLAT46 IPv6 address */
	struct in_ifaddr *ia4_clat_dst = NULL;  /* local CLAT46 IPv4 address */
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Save the original addresses for the protocol translation below */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for
		 * stateless translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* IPv6 traffic class (bits 20-27 of ip6_flow) becomes IPv4 TOS */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly reallocated) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7439 
7440 /* The following is used to enqueue work items for ifnet ioctl events */
7441 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7442 
/* Arguments of a deferred interface ioctl request */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* target interface; an io refcnt is held on it */
	u_long ioctl_code;      /* ioctl command to issue */
};
7447 
/* Network work queue item wrapping a deferred interface ioctl request */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;    /* must be first: used by __container_of */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7452 
/*
 * Queue an ioctl to be issued to the interface from the network work
 * queue.  Takes an io reference on the interface; the reference is
 * released by ifnet_ioctl_event_callback() after the ioctl has run.
 */
void
ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;

	/*
	 * Get an io ref count if the interface is attached.
	 * At this point it most likely is. We are taking a reference for
	 * deferred processing.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
		    "is not attached",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}

	/* Z_NOFAIL: allocation cannot fail, so no NULL check is needed */
	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
}
7478 
/*
 * Work queue callback for ifnet_ioctl_async(): issues the deferred
 * ioctl, drops the io reference taken when the item was queued, and
 * frees the work item.
 */
static void
ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);

	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
	int ret = 0;

	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
	} else if (dlil_verbose) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
		    "for ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
	}
	/* Release the io reference taken by ifnet_ioctl_async() */
	ifnet_decr_iorefcnt(ifp);
	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
	return;
}
7501 
/*
 * Issue an ioctl on an interface, offering it in order to: attached
 * interface filters, the attached protocol (when proto_fam != 0), and
 * finally the interface itself.  The first result other than
 * EOPNOTSUPP wins; a handler may return EJUSTRETURN to stop further
 * processing, which the caller sees as 0.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock around the callback; it may block */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing"; report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7619 
/*
 * Install or clear the BPF tap callback on an interface, if the
 * driver provides an if_set_bpf_tap method.  Returns ENXIO when the
 * interface is not attached; 0 (a no-op) when the driver has no
 * tap support.
 */
__private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
{
	errno_t error = 0;


	if (ifp->if_set_bpf_tap) {
		/* Get an io reference on the interface if it is attached */
		if (!ifnet_is_attached(ifp, 1)) {
			return ENXIO;
		}
		error = ifp->if_set_bpf_tap(ifp, mode, callback);
		ifnet_decr_iorefcnt(ifp);
	}
	return error;
}
7636 
/*
 * Resolve a multicast protocol address to a link-layer address in
 * ll_addr.  The attached protocol gets the first chance to resolve;
 * the interface is then asked to verify the result (or, when the
 * protocol could not resolve, to check the protocol address itself).
 */
errno_t
dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
    struct sockaddr *ll_addr, size_t ll_len)
{
	errno_t result = EOPNOTSUPP;
	struct if_proto *proto;
	const struct sockaddr *verify;
	proto_media_resolve_multi resolvep;

	/* Take an io reference; bail out if not attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return result;
	}

	bzero(ll_addr, ll_len);

	/* Call the protocol first; callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, proto_addr->sa_family);
	ifnet_lock_done(ifp);
	if (proto != NULL) {
		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
		if (resolvep != NULL) {
			result = resolvep(ifp, proto_addr,
			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
		}
		if_proto_free(proto);
	}

	/* Let the interface verify the multicast address */
	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
		if (result == 0) {
			verify = ll_addr;
		} else {
			verify = proto_addr;
		}
		result = ifp->if_check_multi(ifp, verify);
	}

	ifnet_decr_iorefcnt(ifp);
	return result;
}
7679 
/*
 * Send an ARP packet on a single interface by dispatching to the
 * attached protocol's send_arp handler.  Returns ENOTSUP when the
 * interface has IFF_NOARP set, no protocol is attached for the
 * target's address family, or the protocol has no send_arp method.
 */
__private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
	struct if_proto *proto;
	errno_t result = 0;

	if ((ifp->if_flags & IFF_NOARP) != 0) {
		result = ENOTSUP;
		goto done;
	}

	/* callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, target_proto->sa_family);
	ifnet_lock_done(ifp);
	if (proto == NULL) {
		result = ENOTSUP;
	} else {
		proto_media_send_arp    arpp;
		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
		if (arpp == NULL) {
			result = ENOTSUP;
		} else {
			/* Bump transmit statistics before handing off */
			switch (arpop) {
			case ARPOP_REQUEST:
				arpstat.txrequests++;
				if (target_hw != NULL) {
					/* unicast request (target hw known) */
					arpstat.txurequests++;
				}
				break;
			case ARPOP_REPLY:
				arpstat.txreplies++;
				break;
			}
			result = arpp(ifp, arpop, sender_hw, sender_proto,
			    target_hw, target_proto);
		}
		if_proto_free(proto);
	}
done:
	return result;
}
7725 
/*
 * Per-thread network "marks" are bit flags kept in the uthread's
 * uu_network_marks field.  A net_thread_marks_t token encodes, as a
 * byte offset from net_thread_marks_base, the set of bits a
 * push/unpush call actually changed, so the matching pop call can
 * restore exactly those bits and no others.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no marks were changed" (offset 0 from the base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7731 
/*
 * Set the requested mark bits on the current thread.  Returns a token
 * encoding only the bits that were not already set (i.e. the bits
 * this call changed); pass it to net_thread_marks_pop() to undo.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits not already set need undoing later */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* encode the changed-bit mask as an offset from the base object */
	return (net_thread_marks_t)&base[pop];
}
7749 
/*
 * Clear the requested mark bits on the current thread.  Returns a
 * token encoding only the bits that were actually set (i.e. the bits
 * this call cleared); pass it to net_thread_unmarks_pop() to undo.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only bits that were set need restoring later */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* encode the cleared-bit mask as an offset from the base object */
	return (net_thread_marks_t)&base[unpop];
}
7767 
/*
 * Undo a net_thread_marks_push(): clear exactly the bits encoded in
 * the token.  Verifies the token is well-formed and that the bits are
 * in fact still set on the current thread.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	/* recover the changed-bit mask from the pointer offset */
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7783 
/*
 * Undo a net_thread_unmarks_push(): set exactly the bits encoded in
 * the token.  Verifies the token is well-formed and that the bits are
 * in fact still clear on the current thread.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	/* recover the cleared-bit mask from the pointer offset */
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7799 
7800 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7801 net_thread_is_marked(u_int32_t check)
7802 {
7803 	if (check != 0) {
7804 		struct uthread *uth = current_uthread();
7805 		return uth->uu_network_marks & check;
7806 	} else {
7807 		return 0;
7808 	}
7809 }
7810 
7811 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7812 net_thread_is_unmarked(u_int32_t check)
7813 {
7814 	if (check != 0) {
7815 		struct uthread *uth = current_uthread();
7816 		return ~uth->uu_network_marks & check;
7817 	} else {
7818 		return 0;
7819 	}
7820 }
7821 
/*
 * An ARP announcement (gratuitous ARP) carries identical sender and
 * target protocol addresses; a missing address never qualifies.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	if (sender_sin == NULL || target_sin == NULL) {
		return 0;
	}

	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
7832 
/*
 * Send an ARP packet.  Normally dispatches to a single interface via
 * dlil_send_arp_internal().  As a special case, an ARP request for an
 * IPv4 link-local target (that is not an announcement) is replicated
 * across all interfaces marked IFEF_ARPLL, each using its own source
 * hardware and IPv4 addresses.  RTF_ROUTER in rtflags is communicated
 * to the handler via SIN_ROUTER in the target sockaddr.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr alive across the unlock */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* report success if any interface accepted */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7947 
7948 /*
7949  * Caller must hold ifnet head lock.
7950  */
7951 static int
ifnet_lookup(struct ifnet * ifp)7952 ifnet_lookup(struct ifnet *ifp)
7953 {
7954 	struct ifnet *_ifp;
7955 
7956 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7957 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7958 		if (_ifp == ifp) {
7959 			break;
7960 		}
7961 	}
7962 	return _ifp != NULL;
7963 }
7964 
7965 /*
7966  * Caller has to pass a non-zero refio argument to get a
7967  * IO reference count. This will prevent ifnet_detach from
7968  * being called when there are outstanding io reference counts.
7969  */
int
ifnet_is_attached(struct ifnet *ifp, int refio)
{
	int ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED(ifp))) {
		/* take an io reference on behalf of the caller if asked */
		if (refio > 0) {
			ifp->if_refio++;
		}
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
7985 
/* Account for one more kernel thread being started for this interface */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7993 
/*
 * Account for one interface thread having finished starting; wake up
 * any waiter once the pending count drops to zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8005 
8006 /*
8007  * Caller must ensure the interface is attached; the assumption is that
8008  * there is at least an outstanding IO reference count held already.
8009  * Most callers would call ifnet_is_{attached,data_ready}() instead.
8010  */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* caller must already hold an io reference (see comment above) */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8020 
/*
 * Drop one io reference with if_ref_lock already held; wakes up a
 * pending ifnet_detach when the last reference goes away.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov refs are a subset of io refs and cannot outlive them */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8041 
/* Drop one io reference taken via ifnet_is_attached()/ifnet_incr_iorefcnt() */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8049 
/*
 * Enter the data path: take both an io reference and a datamov
 * reference if the interface is fully attached and ready (i.e. not
 * suspended).  Returns FALSE without taking any reference otherwise.
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8064 
/*
 * Leave the data path: drop the datamov and io references taken by
 * ifnet_datamov_begin(), waking any drainer blocked on the last
 * datamov reference.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8082 
/*
 * Suspend data movement with if_ref_lock held: take an io reference
 * (released by ifnet_datamov_resume) and, on the first suspension,
 * clear IFRF_READY so new ifnet_datamov_begin() calls fail.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8093 
/* Suspend data movement on the interface (see the _locked variant) */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8102 
/*
 * Suspend data movement only if no suspension is already in effect.
 * Returns TRUE if this call performed the suspension, FALSE if the
 * interface was already suspended (in which case no reference is
 * taken and the caller must not resume).
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
8116 
/*
 * Block until all threads currently in the data path have left it,
 * then flush the interface send queues.  Data movement must already
 * have been suspended via ifnet_datamov_suspend*().
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* woken by ifnet_datamov_end() when the last mover leaves */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8144 
/* Convenience wrapper: suspend data movement, then drain the data path */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8151 
/*
 * Undo one suspension: when the last suspension is lifted, restore
 * IFRF_READY so the data path may be entered again, and drop the io
 * reference taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8165 
/*
 * Record a backtrace for a dlil_ifnet reference hold (refhold != 0)
 * or release (refhold == 0) into the per-ifnet debug ring buffers.
 * Only valid for interfaces allocated with DLIF_DEBUG.
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* ring buffer: overwrite the oldest entry once full */
	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
8190 
/*
 * Take a reference on the underlying dlil_ifnet storage, optionally
 * tracing the hold when debugging is enabled for this interface.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8213 
/*
 * Drop a reference on the underlying dlil_ifnet storage.  When the
 * last reference is dropped on an interface still in the embryonic
 * state, release the storage back via _dlil_if_release().
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* dropping the last reference */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	/* release outside the lock */
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8248 
/*
 * Attach a protocol to proto->ifp: let the interface family refine
 * the demux descriptors, insert the protocol at the tail of its hash
 * chain, and post a KEV_DL_PROTO_ATTACHED event.  Returns EEXIST if
 * the family is already attached, ENXIO if the interface is gone.
 * On success, *proto_count (if non-NULL) receives the number of
 * protocols now attached.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8328 
8329 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8330 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8331 {
8332 	/*
8333 	 * A protocol has been attached, mark the interface up.
8334 	 * This used to be done by configd.KernelEventMonitor, but that
8335 	 * is inherently prone to races (rdar://problem/30810208).
8336 	 */
8337 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8338 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8339 	dlil_post_sifflags_msg(ifp);
8340 #if SKYWALK
8341 	switch (protocol) {
8342 	case AF_INET:
8343 	case AF_INET6:
8344 		/* don't attach the flowswitch unless attaching IP */
8345 		dlil_attach_flowswitch_nexus(ifp);
8346 		break;
8347 	default:
8348 		break;
8349 	}
8350 #endif /* SKYWALK */
8351 }
8352 
/*
 * Attach a v1 protocol to an interface.  Validates arguments, checks
 * that the interface is on the global list, allocates an if_proto
 * carrying the caller's v1 callbacks, and hands it to
 * dlil_attach_protocol().  On success the interface is brought up via
 * dlil_handle_proto_attach(); on failure the if_proto is freed here.
 * Returns EINVAL on bad arguments, ENXIO if the interface is not in
 * the global list, EEXIST if the family is already attached, or the
 * error from dlil_attach_protocol().
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8414 
/*
 * Attach a v2 protocol to an interface.  Identical to
 * ifnet_attach_protocol() except that the caller supplies
 * struct ifnet_attach_proto_param_v2 callbacks (kProtoKPI_v2).
 * Returns EINVAL on bad arguments, ENXIO if the interface is not in
 * the global list, EEXIST if the family is already attached, or the
 * error from dlil_attach_protocol().
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8476 
/*
 * Detach the protocol of the given family from the interface: notify
 * the family module (if_del_proto), unlink the if_proto from the
 * protocol hash, point its callbacks at the ENXIO-returning
 * ifproto_media_* stubs so any late callers fail gracefully, and drop
 * both the attach reference and the lookup reference.  The remaining
 * detach steps run when the last proto reference is released.
 * Returns EINVAL on bad arguments, ENXIO if the family is not
 * attached, 0 on success.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* replace the callbacks with harmless stubs that return ENXIO */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8542 
8543 
8544 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8545 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8546     struct mbuf *packet, char *header)
8547 {
8548 #pragma unused(ifp, protocol, packet, header)
8549 	return ENXIO;
8550 }
8551 
8552 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8553 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8554     struct mbuf *packet)
8555 {
8556 #pragma unused(ifp, protocol, packet)
8557 	return ENXIO;
8558 }
8559 
8560 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8561 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8562     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8563     char *link_layer_dest)
8564 {
8565 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8566 	return ENXIO;
8567 }
8568 
8569 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8570 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8571     const struct kev_msg *event)
8572 {
8573 #pragma unused(ifp, protocol, event)
8574 }
8575 
8576 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8577 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8578     unsigned long command, void *argument)
8579 {
8580 #pragma unused(ifp, protocol, command, argument)
8581 	return ENXIO;
8582 }
8583 
8584 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8585 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8586     struct sockaddr_dl *out_ll, size_t ll_len)
8587 {
8588 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8589 	return ENXIO;
8590 }
8591 
8592 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8593 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8594     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8595     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8596 {
8597 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8598 	return ENXIO;
8599 }
8600 
8601 extern int if_next_index(void);
8602 extern int tcp_ecn_outbound;
8603 
8604 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8605 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8606 {
8607 	uint32_t sflags = 0;
8608 	int err;
8609 
8610 	if (if_flowadv) {
8611 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8612 	}
8613 
8614 	if (if_delaybased_queue) {
8615 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8616 	}
8617 
8618 	if (ifp->if_output_sched_model ==
8619 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8620 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8621 	}
8622 	/* Inherit drop limit from the default queue */
8623 	if (ifp->if_snd != ifcq) {
8624 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8625 	}
8626 	/* Initialize transmit queue(s) */
8627 	err = ifclassq_setup(ifcq, ifp, sflags);
8628 	if (err != 0) {
8629 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8630 		    "err=%d", __func__, ifp, err);
8631 		/* NOTREACHED */
8632 	}
8633 }
8634 
/*
 * Attach an ifnet to the system: assign an if_index, install the
 * first/permanent link address, set up the transmit classq, create
 * the input/starter/poller kernel threads required by the driver
 * model, wait for those threads to be scheduled once, and finally
 * mark the interface IFRF_ATTACHED|IFRF_READY.  ll_addr, when
 * non-NULL, supplies the link-layer address and must match
 * if_addrlen if that is already set.  Returns EEXIST if already on
 * the global list, EINVAL on address-length mismatch, ENODEV if the
 * family module add/del_proto callbacks are missing, ENOBUFS if no
 * if_index or link-address storage can be obtained, 0 on success.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	/* an attaching interface must still be embryonic at this point */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			/* supplied lladdr doesn't match the recorded length */
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* interface filter list must start out empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* a recycled (DLIF_REUSE) ifnet may carry over multicast memberships */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		if (idx == -1) {
			/* no index available: fail the attach */
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the interface in the global list and index map */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* no input thread for this configuration; not fatal */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the starter thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the poller thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* count link-layer memberships carried over on a recycled ifnet */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9136 
9137 /*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself.  Although the link
9140  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9141  * its location in memory must never change as it may still be referred
9142  * to by some parts of the system afterwards (unfortunate implementation
9143  * artifacts inherited from BSD.)
9144  *
9145  * Caller must hold ifnet lock as writer.
9146  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	/* Caller guarantees exclusive ifnet lock (see block comment above). */
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute the sockaddr_dl sizes: the mask covers the interface
	 * name portion only; the address additionally carries if_addrlen
	 * bytes of link-layer address.  The total is rounded up to a
	 * 32-bit boundary and never smaller than a full sockaddr_dl.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* room for the ifaddr plus max-size address and mask */
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: never freed (see block comment) */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* Populate the AF_LINK address: name, index, type, lladdr bytes. */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* Netmask: all-ones over the name portion only. */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* Drop the reference that the previous if_lladdr (if any) held. */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9255 
/*
 * Purge all IPv4 (when INET is configured) and IPv6 addresses from
 * the interface; called during detach so upper layers drop their
 * network addresses before the ifnet is torn down.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9264 
/*
 * Begin detaching an interface: mark it down and DETACHING, unlink it
 * from the global interface lists so it is no longer visible to lookups,
 * then hand it to the detacher worker thread which performs the final
 * teardown (ifnet_detach_final) outside of this call chain.
 *
 * Returns EINVAL if ifp is NULL or was never attached, ENXIO if a
 * detach is already in progress, 0 otherwise.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Stop IPv6 CGA usage on this interface before going down. */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	/* Lock order: ifnet head (exclusive) before the per-ifnet lock. */
	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* Transition ATTACHED -> DETACHING under if_ref_lock. */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connection after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9457 
/*
 * Append ifp to the global list of interfaces pending final detach and
 * wake the detacher thread.  Caller must hold the dlil_if lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* ifnet_detacher_thread_cont sleeps on ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9468 
/*
 * Remove and return the next interface pending final detach, or NULL
 * if the list is empty.  Caller must hold the dlil_if lock.  The
 * removed entry's list linkage is cleared so ifnet_detach_final can
 * later VERIFY it is fully unlinked.
 */
static struct ifnet *
ifnet_detaching_dequeue(void)
{
	struct ifnet *ifp;

	dlil_if_lock_assert();

	ifp = TAILQ_FIRST(&ifnet_detaching_head);
	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
	if (ifp != NULL) {
		VERIFY(ifnet_detaching_cnt != 0);
		--ifnet_detaching_cnt;
		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
		ifp->if_detaching_link.tqe_next = NULL;
		ifp->if_detaching_link.tqe_prev = NULL;
	}
	return ifp;
}
9487 
/*
 * Continuation body of the detacher thread: drain the pending-detach
 * list, calling ifnet_detach_final on each interface (with the dlil_if
 * lock dropped around the call), then block again on ifnet_delayed_run
 * with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first wakeup after thread creation; see thread_func below */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constrain so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock: detach_final may block for a while */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* assert_wait must precede the unlock to avoid a lost wakeup */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9530 
/*
 * Entry point of the detacher thread.  Arms the wait channel, marks the
 * thread embryonic, and issues a self-wakeup so that the continuation
 * runs once immediately to leave the embryonic state (decrementing the
 * pending-thread count there).  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9547 
/*
 * Final stage of interface detach, run from the detacher thread.
 * Waits for all outstanding I/O references to drain, then tears down
 * everything attached to the ifnet: send queue, interface filters,
 * protocols, the permanent link address, starter/poller/input threads,
 * cached routes, and driver callbacks (which are repointed at local
 * stubs in case the driver unloads).  Finally clears IFRF_DETACHING,
 * invokes the driver's if_free callback, and releases the attach-time
 * reference.  The DLIF_INUSE bit is cleared last so the dlil_ifnet
 * storage can be recycled by dlil_if_acquire.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach.  This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/*
	 * Detach interface filters.  The list head is snapshotted and
	 * cleared under if_flt_lock; the lock is then dropped around
	 * each dlil_detach_filter_internal call, which may block.
	 */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/*
	 * Unplumb all protocols.  The ifnet lock is dropped around each
	 * proto_unplumb call, so re-read the hash slot head each pass
	 * until the slot is empty.
	 */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	IFA_REMREF(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time.  Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			/* snapshot and clear the thread pointers atomically */
			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;    /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop ifnet lock while sleeping on the input thread */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous/allmulti counts need to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_amcount = 0;
	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* notify the driver that the interface is gone */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9929 
/*
 * Output stub installed on detached interfaces: silently drop the
 * packet chain and report success so lingering callers don't error out.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
9937 
/*
 * Starter stub installed on detached interfaces: purge anything still
 * queued instead of transmitting.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9943 
/*
 * Input stub installed on detached interfaces: free the inbound chain
 * and report ENXIO (device not configured).
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9953 
9954 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9955 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9956     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9957 {
9958 #pragma unused(ifp, flags, max_cnt)
9959 	if (m_head != NULL) {
9960 		*m_head = NULL;
9961 	}
9962 	if (m_tail != NULL) {
9963 		*m_tail = NULL;
9964 	}
9965 	if (cnt != NULL) {
9966 		*cnt = 0;
9967 	}
9968 	if (len != NULL) {
9969 		*len = 0;
9970 	}
9971 }
9972 
/*
 * Control stub installed on detached interfaces (used for both
 * if_output_ctl and if_input_ctl): all commands are unsupported.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9979 
/*
 * Demux stub installed on detached interfaces: consume (free) the
 * packet and return EJUSTRETURN so the caller does not process it.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9987 
/*
 * add_proto stub installed on detached interfaces: no protocol may be
 * attached anymore.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9995 
/*
 * del_proto stub installed on detached interfaces: nothing left to
 * detach (protocols were unplumbed in ifnet_detach_final).
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
10002 
/*
 * check_multi stub installed on detached interfaces: multicast address
 * validation is unsupported.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10009 
/*
 * Legacy framer stub installed on detached interfaces.  The signature
 * differs by platform (embedded builds carry pre/post out-params);
 * both variants simply forward to ifp_if_framer_extended, passing NULL
 * for pre/post on macOS where the legacy KPI does not expose them.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10028 
10029 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10030 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10031     const struct sockaddr *sa, const char *ll, const char *t,
10032     u_int32_t *pre, u_int32_t *post)
10033 {
10034 #pragma unused(ifp, sa, ll, t)
10035 	m_freem(*m);
10036 	*m = NULL;
10037 
10038 	if (pre != NULL) {
10039 		*pre = 0;
10040 	}
10041 	if (post != NULL) {
10042 		*post = 0;
10043 	}
10044 
10045 	return EJUSTRETURN;
10046 }
10047 
/*
 * ioctl stub installed on detached interfaces: every command is
 * unsupported.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10054 
/*
 * BPF tap stub installed on detached interfaces: accept and ignore the
 * request (returning success keeps lingering BPF callers happy).
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10062 
/*
 * if_free stub installed on detached interfaces: the driver's real
 * if_free was already invoked in ifnet_detach_final, so this is a no-op.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10068 
/*
 * Event stub installed on detached interfaces: kernel events are
 * silently ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10074 
/*
 * Acquire a dlil_ifnet for the given family/uniqueid/ifxname triple.
 * Either recycles a not-in-use entry from dlil_ifnet_head whose unique id
 * matches, or allocates a fresh one.  On success *ifp points at the
 * (referenced) ifnet and 0 is returned; EBUSY is returned if an in-use
 * interface already claims the same extended name or unique id.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		/* only entries of the requested family can conflict or be reused */
		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		/* mark both in-use and recycled so attach knows state persists */
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* stash a private copy of the caller's unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* point name/xname at the embedded storage inside dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* sanity check the alignment guaranteed by the P2ROUNDUP above */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10252 
/*
 * Common release path for a dlil_ifnet: drops the alloc-count statistics,
 * frees an out-of-line broadcast address, resets if_name/if_xname to the
 * embedded storage, and optionally clears DLIF_INUSE so the entry becomes
 * eligible for recycling by dlil_if_acquire().
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* free a broadcast address that did not fit the inline buffer */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	/* preserve the name, then point if_name back at embedded storage */
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10283 
/* Release an ifnet without clearing DLIF_INUSE (entry stays claimed). */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10289 
/* Take the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10295 
/* Drop the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10301 
/* Assert (DEBUG builds) that the current thread owns dlil_ifnet_lock. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10307 
10308 __private_extern__ void
dlil_proto_unplumb_all(struct ifnet * ifp)10309 dlil_proto_unplumb_all(struct ifnet *ifp)
10310 {
10311 	/*
10312 	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
10313 	 * each bucket contains exactly one entry; PF_VLAN does not need an
10314 	 * explicit unplumb.
10315 	 *
10316 	 * if_proto_hash[3] is for other protocols; we expect anything
10317 	 * in this bucket to respond to the DETACHING event (which would
10318 	 * have happened by now) and do the unplumb then.
10319 	 */
10320 	(void) proto_unplumb(PF_INET, ifp);
10321 	(void) proto_unplumb(PF_INET6, ifp);
10322 }
10323 
/*
 * Copy the interface's cached IPv4 source route into *dst, taking a
 * reference on the underlying rtentry (route_copyout).  The lock is
 * taken spin-then-converted because route_copyout may block.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10334 
/*
 * Install *src as the interface's cached IPv4 source route; consumes the
 * caller's route reference.  If forwarding-cache use is disabled
 * (if_fwd_cacheok == 0) the route is simply released instead.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10348 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached
 * route_in6 into *dst with a reference on the rtentry.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10360 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): install *src as the cached
 * route_in6 (consuming the caller's reference), or release it when
 * forwarding-cache use is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10375 
/*
 * Look up (and cache) a scoped IPv4 route to src_ip on ifp.  Returns a
 * referenced rtentry (caller must RT_REMREF/rtfree) or NULL if no route
 * could be found.  The per-interface cache is consulted first; a fresh
 * rtalloc1_scoped() lookup is done only on miss or destination change.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	/* dst aliases the ro_dst embedded in the local route copy */
	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize ro_dst if it is not an IPv4 sockaddr yet */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10410 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up (and cache)
 * a scoped route to *src_ip6 on ifp.  Returns a referenced rtentry or
 * NULL.  The cached route is reused when still usable and addressed to
 * the same destination.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is expected NULL here after ROUTE_RELEASE above */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10447 
/*
 * Update the interface's link quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event on change.  'locked' indicates
 * whether the caller already holds the ifnet lock exclusively; either way
 * the lock is dropped around the event post, and restored to the caller's
 * ownership state on return.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* worst-quality edge: kick TCP into aborting flows quickly */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10512 
/*
 * Update the interface's cellular RRC state and post
 * KEV_DL_RRC_STATE_CHANGED when it actually changes.  Must be called
 * with the ifnet lock held exclusively; the lock is dropped around the
 * event post and reacquired before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op when the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	/* restore the caller's exclusive hold */
	ifnet_lock_exclusive(ifp);
}
10542 
/*
 * Apply externally supplied interface state (LQM, RRC, availability) to
 * ifp.  Validates each field flagged in valid_bitmask before applying it:
 * RRC updates are only legal on cellular interfaces (ENOTSUP otherwise),
 * and out-of-range LQM/RRC values return EINVAL.  When the interface
 * transitions to available, pending TCP connections on it are probed.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/* both helpers below are called with the lock held (locked == 1) */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10613 
10614 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10615 if_get_state(struct ifnet *ifp,
10616     struct if_interface_state *if_interface_state)
10617 {
10618 	ifnet_lock_shared(ifp);
10619 
10620 	if_interface_state->valid_bitmask = 0;
10621 
10622 	if (ifp->if_interface_state.valid_bitmask &
10623 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10624 		if_interface_state->valid_bitmask |=
10625 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10626 		if_interface_state->rrc_state =
10627 		    ifp->if_interface_state.rrc_state;
10628 	}
10629 	if (ifp->if_interface_state.valid_bitmask &
10630 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10631 		if_interface_state->valid_bitmask |=
10632 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10633 		if_interface_state->lqm_state =
10634 		    ifp->if_interface_state.lqm_state;
10635 	}
10636 	if (ifp->if_interface_state.valid_bitmask &
10637 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10638 		if_interface_state->valid_bitmask |=
10639 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10640 		if_interface_state->interface_availability =
10641 		    ifp->if_interface_state.interface_availability;
10642 	}
10643 
10644 	ifnet_lock_done(ifp);
10645 }
10646 
10647 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10648 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10649 {
10650 	if (conn_probe > 1) {
10651 		return EINVAL;
10652 	}
10653 	if (conn_probe == 0) {
10654 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10655 	} else {
10656 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10657 	}
10658 
10659 #if NECP
10660 	necp_update_all_clients();
10661 #endif /* NECP */
10662 
10663 	tcp_probe_connectivity(ifp, conn_probe);
10664 	return 0;
10665 }
10666 
10667 /* for uuid.c */
10668 static int
get_ether_index(int * ret_other_index)10669 get_ether_index(int * ret_other_index)
10670 {
10671 	struct ifnet *ifp;
10672 	int en0_index = 0;
10673 	int other_en_index = 0;
10674 	int any_ether_index = 0;
10675 	short best_unit = 0;
10676 
10677 	*ret_other_index = 0;
10678 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10679 		/*
10680 		 * find en0, or if not en0, the lowest unit en*, and if not
10681 		 * that, any ethernet
10682 		 */
10683 		ifnet_lock_shared(ifp);
10684 		if (strcmp(ifp->if_name, "en") == 0) {
10685 			if (ifp->if_unit == 0) {
10686 				/* found en0, we're done */
10687 				en0_index = ifp->if_index;
10688 				ifnet_lock_done(ifp);
10689 				break;
10690 			}
10691 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
10692 				other_en_index = ifp->if_index;
10693 				best_unit = ifp->if_unit;
10694 			}
10695 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10696 			any_ether_index = ifp->if_index;
10697 		}
10698 		ifnet_lock_done(ifp);
10699 	}
10700 	if (en0_index == 0) {
10701 		if (other_en_index != 0) {
10702 			*ret_other_index = other_en_index;
10703 		} else if (any_ether_index != 0) {
10704 			*ret_other_index = any_ether_index;
10705 		}
10706 	}
10707 	return en0_index;
10708 }
10709 
/*
 * Fill node[ETHER_ADDR_LEN] with a MAC address suitable for UUID
 * generation.  Prefers en0 (index cached across calls in a static),
 * falling back to another en* or any ethernet interface.  The permanent
 * ethernet address is used when set, since it never changes.  Returns 0
 * on success, -1 when no suitable interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* revalidate the cached index; the interface may have detached */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10751 
10752 static int
10753 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10754 {
10755 #pragma unused(arg1, arg2)
10756 	uint32_t i;
10757 	int err;
10758 
10759 	i = if_rxpoll;
10760 
10761 	err = sysctl_handle_int(oidp, &i, 0, req);
10762 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10763 		return err;
10764 	}
10765 
10766 	if (net_rxpoll == 0) {
10767 		return ENXIO;
10768 	}
10769 
10770 	if_rxpoll = i;
10771 	return err;
10772 }
10773 
10774 static int
10775 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10776 {
10777 #pragma unused(arg1, arg2)
10778 	uint64_t q;
10779 	int err;
10780 
10781 	q = if_rxpoll_mode_holdtime;
10782 
10783 	err = sysctl_handle_quad(oidp, &q, 0, req);
10784 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10785 		return err;
10786 	}
10787 
10788 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10789 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10790 	}
10791 
10792 	if_rxpoll_mode_holdtime = q;
10793 
10794 	return err;
10795 }
10796 
10797 static int
10798 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10799 {
10800 #pragma unused(arg1, arg2)
10801 	uint64_t q;
10802 	int err;
10803 
10804 	q = if_rxpoll_sample_holdtime;
10805 
10806 	err = sysctl_handle_quad(oidp, &q, 0, req);
10807 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10808 		return err;
10809 	}
10810 
10811 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10812 		q = IF_RXPOLL_SAMPLETIME_MIN;
10813 	}
10814 
10815 	if_rxpoll_sample_holdtime = q;
10816 
10817 	return err;
10818 }
10819 
10820 static int
10821 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10822 {
10823 #pragma unused(arg1, arg2)
10824 	uint64_t q;
10825 	int err;
10826 
10827 	q = if_rxpoll_interval_time;
10828 
10829 	err = sysctl_handle_quad(oidp, &q, 0, req);
10830 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10831 		return err;
10832 	}
10833 
10834 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10835 		q = IF_RXPOLL_INTERVALTIME_MIN;
10836 	}
10837 
10838 	if_rxpoll_interval_time = q;
10839 
10840 	return err;
10841 }
10842 
10843 static int
10844 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10845 {
10846 #pragma unused(arg1, arg2)
10847 	uint32_t i;
10848 	int err;
10849 
10850 	i = if_sysctl_rxpoll_wlowat;
10851 
10852 	err = sysctl_handle_int(oidp, &i, 0, req);
10853 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10854 		return err;
10855 	}
10856 
10857 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10858 		return EINVAL;
10859 	}
10860 
10861 	if_sysctl_rxpoll_wlowat = i;
10862 	return err;
10863 }
10864 
10865 static int
10866 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10867 {
10868 #pragma unused(arg1, arg2)
10869 	uint32_t i;
10870 	int err;
10871 
10872 	i = if_sysctl_rxpoll_whiwat;
10873 
10874 	err = sysctl_handle_int(oidp, &i, 0, req);
10875 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10876 		return err;
10877 	}
10878 
10879 	if (i <= if_sysctl_rxpoll_wlowat) {
10880 		return EINVAL;
10881 	}
10882 
10883 	if_sysctl_rxpoll_whiwat = i;
10884 	return err;
10885 }
10886 
10887 static int
10888 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10889 {
10890 #pragma unused(arg1, arg2)
10891 	int i, err;
10892 
10893 	i = if_sndq_maxlen;
10894 
10895 	err = sysctl_handle_int(oidp, &i, 0, req);
10896 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10897 		return err;
10898 	}
10899 
10900 	if (i < IF_SNDQ_MINLEN) {
10901 		i = IF_SNDQ_MINLEN;
10902 	}
10903 
10904 	if_sndq_maxlen = i;
10905 	return err;
10906 }
10907 
10908 static int
10909 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10910 {
10911 #pragma unused(arg1, arg2)
10912 	int i, err;
10913 
10914 	i = if_rcvq_maxlen;
10915 
10916 	err = sysctl_handle_int(oidp, &i, 0, req);
10917 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10918 		return err;
10919 	}
10920 
10921 	if (i < IF_RCVQ_MINLEN) {
10922 		i = IF_RCVQ_MINLEN;
10923 	}
10924 
10925 	if_rcvq_maxlen = i;
10926 	return err;
10927 }
10928 
10929 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10930 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10931     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10932 {
10933 	struct kev_dl_node_presence kev;
10934 	struct sockaddr_dl *sdl;
10935 	struct sockaddr_in6 *sin6;
10936 	int ret = 0;
10937 
10938 	VERIFY(ifp);
10939 	VERIFY(sa);
10940 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10941 
10942 	bzero(&kev, sizeof(kev));
10943 	sin6 = &kev.sin6_node_address;
10944 	sdl = &kev.sdl_node_address;
10945 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10946 	kev.rssi = rssi;
10947 	kev.link_quality_metric = lqm;
10948 	kev.node_proximity_metric = npm;
10949 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10950 
10951 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10952 	if (ret == 0 || ret == EEXIST) {
10953 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10954 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10955 		if (err != 0) {
10956 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10957 			    "error %d\n", __func__, err);
10958 		}
10959 	}
10960 
10961 	if (ret == EEXIST) {
10962 		ret = 0;
10963 	}
10964 	return ret;
10965 }
10966 
/*
 * Record that a neighbor node has disappeared from ifp and, on success,
 * post KEV_DL_NODE_ABSENCE.  The node may be identified by either its
 * IPv6 address (link-layer address recovered from the ND cache) or its
 * AF_LINK address (IPv6 form derived from it).
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp interface identity into the link-layer address */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11007 
/*
 * Variant of dlil_node_present() where the caller supplies both the IPv6
 * address (sa) and the link-layer address (sdl) explicitly rather than
 * having one derived from the other.  Records the node in the ND cache
 * and posts KEV_DL_NODE_PRESENCE; EEXIST is treated as success.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* stamp interface identity into the link-layer address */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* an already-present node is not an error to the caller */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11051 
11052 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11053 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11054     kauth_cred_t *credp)
11055 {
11056 	const u_int8_t *bytes;
11057 	size_t size;
11058 
11059 	bytes = CONST_LLADDR(sdl);
11060 	size = sdl->sdl_alen;
11061 
11062 #if CONFIG_MACF
11063 	if (dlil_lladdr_ckreq) {
11064 		switch (sdl->sdl_type) {
11065 		case IFT_ETHER:
11066 		case IFT_IEEE1394:
11067 			break;
11068 		default:
11069 			credp = NULL;
11070 			break;
11071 		}
11072 		;
11073 
11074 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11075 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11076 				[0] = 2
11077 			};
11078 
11079 			bytes = unspec;
11080 		}
11081 	}
11082 #else
11083 #pragma unused(credp)
11084 #endif
11085 
11086 	if (sizep != NULL) {
11087 		*sizep = size;
11088 	}
11089 	return bytes;
11090 }
11091 
11092 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11093 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11094     u_int8_t info[DLIL_MODARGLEN])
11095 {
11096 	struct kev_dl_issues kev;
11097 	struct timeval tv;
11098 
11099 	VERIFY(ifp != NULL);
11100 	VERIFY(modid != NULL);
11101 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11102 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11103 
11104 	bzero(&kev, sizeof(kev));
11105 
11106 	microtime(&tv);
11107 	kev.timestamp = tv.tv_sec;
11108 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11109 	if (info != NULL) {
11110 		bcopy(info, &kev.info, DLIL_MODARGLEN);
11111 	}
11112 
11113 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11114 	    &kev.link_data, sizeof(kev), FALSE);
11115 }
11116 
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC.  The set path
 * (superuser only) maps ifo_flags to a throttle level and applies it;
 * the get path reports the current level.  On success the current count
 * of opportunistic TCP/UDP connections on the interface is returned in
 * ifo_inuse.  EALREADY from ifnet_set_throttle is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* map the only two legal flag values to throttle levels */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* EALREADY (throttle already at requested level) is not an error */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11175 
11176 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)11177 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11178 {
11179 	struct ifclassq *ifq;
11180 	int err = 0;
11181 
11182 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11183 		return ENXIO;
11184 	}
11185 
11186 	*level = IFNET_THROTTLE_OFF;
11187 
11188 	ifq = ifp->if_snd;
11189 	IFCQ_LOCK(ifq);
11190 	/* Throttling works only for IFCQ, not ALTQ instances */
11191 	if (IFCQ_IS_ENABLED(ifq)) {
11192 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11193 
11194 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11195 		*level = req.level;
11196 	}
11197 	IFCQ_UNLOCK(ifq);
11198 
11199 	return err;
11200 }
11201 
11202 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)11203 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11204 {
11205 	struct ifclassq *ifq;
11206 	int err = 0;
11207 
11208 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11209 		return ENXIO;
11210 	}
11211 
11212 	ifq = ifp->if_snd;
11213 
11214 	switch (level) {
11215 	case IFNET_THROTTLE_OFF:
11216 	case IFNET_THROTTLE_OPPORTUNISTIC:
11217 		break;
11218 	default:
11219 		return EINVAL;
11220 	}
11221 
11222 	IFCQ_LOCK(ifq);
11223 	if (IFCQ_IS_ENABLED(ifq)) {
11224 		cqrq_throttle_t req = { 1, level };
11225 
11226 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11227 	}
11228 	IFCQ_UNLOCK(ifq);
11229 
11230 	if (err == 0) {
11231 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11232 		    level);
11233 #if NECP
11234 		necp_update_all_clients();
11235 #endif /* NECP */
11236 		if (level == IFNET_THROTTLE_OFF) {
11237 			ifnet_start(ifp);
11238 		}
11239 	}
11240 
11241 	return err;
11242 }
11243 
11244 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11245 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11246     struct proc *p)
11247 {
11248 #pragma unused(p)
11249 	errno_t result = 0;
11250 	uint32_t flags;
11251 	int level, category, subcategory;
11252 
11253 	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11254 
11255 	if (cmd == SIOCSIFLOG) {
11256 		if ((result = priv_check_cred(kauth_cred_get(),
11257 		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11258 			return result;
11259 		}
11260 
11261 		level = ifr->ifr_log.ifl_level;
11262 		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11263 			result = EINVAL;
11264 		}
11265 
11266 		flags = ifr->ifr_log.ifl_flags;
11267 		if ((flags &= IFNET_LOGF_MASK) == 0) {
11268 			result = EINVAL;
11269 		}
11270 
11271 		category = ifr->ifr_log.ifl_category;
11272 		subcategory = ifr->ifr_log.ifl_subcategory;
11273 
11274 		if (result == 0) {
11275 			result = ifnet_set_log(ifp, level, flags,
11276 			    category, subcategory);
11277 		}
11278 	} else {
11279 		result = ifnet_get_log(ifp, &level, &flags, &category,
11280 		    &subcategory);
11281 		if (result == 0) {
11282 			ifr->ifr_log.ifl_level = level;
11283 			ifr->ifr_log.ifl_flags = flags;
11284 			ifr->ifr_log.ifl_category = category;
11285 			ifr->ifr_log.ifl_subcategory = subcategory;
11286 		}
11287 	}
11288 
11289 	return result;
11290 }
11291 
11292 int
ifnet_set_log(struct ifnet * ifp,int32_t level,uint32_t flags,int32_t category,int32_t subcategory)11293 ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
11294     int32_t category, int32_t subcategory)
11295 {
11296 	int err = 0;
11297 
11298 	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
11299 	VERIFY(flags & IFNET_LOGF_MASK);
11300 
11301 	/*
11302 	 * The logging level applies to all facilities; make sure to
11303 	 * update them all with the most current level.
11304 	 */
11305 	flags |= ifp->if_log.flags;
11306 
11307 	if (ifp->if_output_ctl != NULL) {
11308 		struct ifnet_log_params l;
11309 
11310 		bzero(&l, sizeof(l));
11311 		l.level = level;
11312 		l.flags = flags;
11313 		l.flags &= ~IFNET_LOGF_DLIL;
11314 		l.category = category;
11315 		l.subcategory = subcategory;
11316 
11317 		/* Send this request to lower layers */
11318 		if (l.flags != 0) {
11319 			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
11320 			    sizeof(l), &l);
11321 		}
11322 	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
11323 		/*
11324 		 * If targeted to the lower layers without an output
11325 		 * control callback registered on the interface, just
11326 		 * silently ignore facilities other than ours.
11327 		 */
11328 		flags &= IFNET_LOGF_DLIL;
11329 		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
11330 			level = 0;
11331 		}
11332 	}
11333 
11334 	if (err == 0) {
11335 		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
11336 			ifp->if_log.flags = 0;
11337 		} else {
11338 			ifp->if_log.flags |= flags;
11339 		}
11340 
11341 		log(LOG_INFO, "%s: logging level set to %d flags=%b "
11342 		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
11343 		    ifp->if_log.level, ifp->if_log.flags,
11344 		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
11345 		    category, subcategory);
11346 	}
11347 
11348 	return err;
11349 }
11350 
/*
 * Copy out the current logging parameters of "ifp".  Each output
 * pointer may be NULL if the caller is not interested in that field.
 * Always returns 0.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11370 
11371 int
ifnet_notify_address(struct ifnet * ifp,int af)11372 ifnet_notify_address(struct ifnet *ifp, int af)
11373 {
11374 	struct ifnet_notify_address_params na;
11375 
11376 #if PF
11377 	(void) pf_ifaddr_hook(ifp);
11378 #endif /* PF */
11379 
11380 	if (ifp->if_output_ctl == NULL) {
11381 		return EOPNOTSUPP;
11382 	}
11383 
11384 	bzero(&na, sizeof(na));
11385 	na.address_family = (sa_family_t)af;
11386 
11387 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11388 	           sizeof(na), &na);
11389 }
11390 
11391 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11392 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11393 {
11394 	if (ifp == NULL || flowid == NULL) {
11395 		return EINVAL;
11396 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11397 	    !IF_FULLY_ATTACHED(ifp)) {
11398 		return ENXIO;
11399 	}
11400 
11401 	*flowid = ifp->if_flowhash;
11402 
11403 	return 0;
11404 }
11405 
11406 errno_t
ifnet_disable_output(struct ifnet * ifp)11407 ifnet_disable_output(struct ifnet *ifp)
11408 {
11409 	int err;
11410 
11411 	if (ifp == NULL) {
11412 		return EINVAL;
11413 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11414 	    !IF_FULLY_ATTACHED(ifp)) {
11415 		return ENXIO;
11416 	}
11417 
11418 	if ((err = ifnet_fc_add(ifp)) == 0) {
11419 		lck_mtx_lock_spin(&ifp->if_start_lock);
11420 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11421 		lck_mtx_unlock(&ifp->if_start_lock);
11422 	}
11423 	return err;
11424 }
11425 
11426 errno_t
ifnet_enable_output(struct ifnet * ifp)11427 ifnet_enable_output(struct ifnet *ifp)
11428 {
11429 	if (ifp == NULL) {
11430 		return EINVAL;
11431 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11432 	    !IF_FULLY_ATTACHED(ifp)) {
11433 		return ENXIO;
11434 	}
11435 
11436 	ifnet_start_common(ifp, TRUE);
11437 	return 0;
11438 }
11439 
/*
 * Flow-advisory callback: a driver signals that the flow identified by
 * "flowhash" may transmit again.  Looks up (and removes) the matching
 * flow-control entry and re-enables output on its interface.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* ifnet_fc_get() removes the entry; we own freeing it below */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the I/O reference taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11463 
11464 /*
11465  * Function to compare ifnet_fc_entries in ifnet flow control tree
11466  */
11467 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11468 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11469 {
11470 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11471 }
11472 
/*
 * Insert "ifp" into the global flow-control tree, keyed by its flow
 * hash.  Returns 0 when the interface is added (or is already present),
 * or EAGAIN on a (rare) flow-hash collision with a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: the Z_WAITOK allocation below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11516 
/*
 * Look up the flow-control entry for "flowhash", remove it from the
 * tree, and hand it to the caller (who must release it with
 * ifnet_fc_entry_free()).  Returns NULL when no entry exists or when
 * the associated interface is not attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Entry found: it always leaves the tree, regardless of outcome */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11554 
11555 static void
ifnet_fc_entry_free(struct ifnet_fc_entry * ifce)11556 ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
11557 {
11558 	zfree(ifnet_fc_zone, ifce);
11559 }
11560 
11561 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11562 ifnet_calc_flowhash(struct ifnet *ifp)
11563 {
11564 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11565 	uint32_t flowhash = 0;
11566 
11567 	if (ifnet_flowhash_seed == 0) {
11568 		ifnet_flowhash_seed = RandomULong();
11569 	}
11570 
11571 	bzero(&fh, sizeof(fh));
11572 
11573 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11574 	fh.ifk_unit = ifp->if_unit;
11575 	fh.ifk_flags = ifp->if_flags;
11576 	fh.ifk_eflags = ifp->if_eflags;
11577 	fh.ifk_capabilities = ifp->if_capabilities;
11578 	fh.ifk_capenable = ifp->if_capenable;
11579 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11580 	fh.ifk_rand1 = RandomULong();
11581 	fh.ifk_rand2 = RandomULong();
11582 
11583 try_again:
11584 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11585 	if (flowhash == 0) {
11586 		/* try to get a non-zero flowhash */
11587 		ifnet_flowhash_seed = RandomULong();
11588 		goto try_again;
11589 	}
11590 
11591 	return flowhash;
11592 }
11593 
/*
 * Install (or, when "len" is 0, clear) the network signature of "ifp"
 * for the given address family, under the per-family data lock.
 * Returns ENOMEM when the per-family extension area is absent, EINVAL
 * on an oversized signature or unsupported family.  "flags" is
 * currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* break exits the switch; unlock first */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* break exits the switch; unlock first */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11655 
/*
 * Copy out the network signature of "ifp" for the given address family.
 * On entry *len is the caller's buffer size; on success it is updated
 * to the actual signature length and *flags (if non-NULL) is set to 0.
 * Returns EINVAL for NULL/short buffers or an unsupported family,
 * ENOENT when no signature is set, ENOMEM when the per-family
 * extension area is absent.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must be able to hold the signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must be able to hold the signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11716 
/*
 * Install the set of NAT64 prefixes for "ifp" (an array of
 * NAT64_MAX_NUM_PREFIXES entries).  A zero prefix_len clears that slot;
 * otherwise the length must be one of the supported values
 * (32/40/48/56/64/96 bits) and the prefix must not carry an embedded
 * interface/link-local scope.  NECP clients are notified when at least
 * one prefix was set.  Returns ENOMEM when the inet6 extension area is
 * absent, EINVAL on a bad prefix.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Tell NECP clients only after the lock has been dropped */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11782 
11783 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11784 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11785 {
11786 	int i, found_one = 0, error = 0;
11787 
11788 	if (ifp == NULL) {
11789 		return EINVAL;
11790 	}
11791 
11792 	if_inet6data_lock_shared(ifp);
11793 
11794 	if (IN6_IFEXTRA(ifp) == NULL) {
11795 		error = ENOMEM;
11796 		goto out;
11797 	}
11798 
11799 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11800 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11801 			found_one = 1;
11802 		}
11803 	}
11804 
11805 	if (found_one == 0) {
11806 		error = ENOENT;
11807 		goto out;
11808 	}
11809 
11810 	if (prefixes) {
11811 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11812 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11813 	}
11814 
11815 out:
11816 	if_inet6data_lock_done(ifp);
11817 
11818 	return error;
11819 }
11820 
/*
 * Transmit-path checksum debug hook: when HWCKSUM_DBG_FINALIZE_FORCED
 * is set, force software finalization of delayed IPv4/IPv6 checksums
 * for non-TSO packets and count how many headers/payloads were
 * finalized.  "hoff" is passed through to the finalize routines as the
 * header offset within the mbuf (presumably the start of the IP header
 * — confirm with callers).
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* Nothing to do unless forcing is enabled; TSO packets are exempt */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11862 
/*
 * Receive-path checksum debug hook.  Depending on hwcksum_dbg_mode it
 * can: (1) force partial checksum offload by computing the 16-bit 1's
 * complement sum in software (HWCKSUM_DBG_PARTIAL_FORCED), (2) verify
 * a driver-supplied partial checksum against a software computation,
 * and (3) re-adjust the sum to a different start offset
 * (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) to emulate hardware that sums from
 * various offsets.  "frame_header" must point at the link-layer header
 * within the mbuf's data area.  Only PF_INET/PF_INET6 are handled.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check that the frame header lies within the mbuf data */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length preceding the current data pointer */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11987 
11988 static int
11989 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11990 {
11991 #pragma unused(arg1, arg2)
11992 	u_int32_t i;
11993 	int err;
11994 
11995 	i = hwcksum_dbg_mode;
11996 
11997 	err = sysctl_handle_int(oidp, &i, 0, req);
11998 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11999 		return err;
12000 	}
12001 
12002 	if (hwcksum_dbg == 0) {
12003 		return ENODEV;
12004 	}
12005 
12006 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12007 		return EINVAL;
12008 	}
12009 
12010 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12011 
12012 	return err;
12013 }
12014 
12015 static int
12016 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12017 {
12018 #pragma unused(arg1, arg2)
12019 	u_int32_t i;
12020 	int err;
12021 
12022 	i = hwcksum_dbg_partial_rxoff_forced;
12023 
12024 	err = sysctl_handle_int(oidp, &i, 0, req);
12025 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12026 		return err;
12027 	}
12028 
12029 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12030 		return ENODEV;
12031 	}
12032 
12033 	hwcksum_dbg_partial_rxoff_forced = i;
12034 
12035 	return err;
12036 }
12037 
12038 static int
12039 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12040 {
12041 #pragma unused(arg1, arg2)
12042 	u_int32_t i;
12043 	int err;
12044 
12045 	i = hwcksum_dbg_partial_rxoff_adj;
12046 
12047 	err = sysctl_handle_int(oidp, &i, 0, req);
12048 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12049 		return err;
12050 	}
12051 
12052 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12053 		return ENODEV;
12054 	}
12055 
12056 	hwcksum_dbg_partial_rxoff_adj = i;
12057 
12058 	return err;
12059 }
12060 
12061 static int
12062 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12063 {
12064 #pragma unused(oidp, arg1, arg2)
12065 	int err;
12066 
12067 	if (req->oldptr == USER_ADDR_NULL) {
12068 	}
12069 	if (req->newptr != USER_ADDR_NULL) {
12070 		return EPERM;
12071 	}
12072 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12073 	    sizeof(struct chain_len_stats));
12074 
12075 	return err;
12076 }
12077 
12078 
12079 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification: arbitrary reference bytes (the leading
 * 0x1f 0x8b suggests a gzip stream fragment) used as input for the
 * m_sum16()/in_cksum_mbuf_ref() self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12116 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* TRUE once sumr has been filled in */
	uint16_t        len;    /* span length in bytes, starting at offset 0 */
	uint16_t        sumr;   /* reference, computed at run time */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12141 
12142 static void
dlil_verify_sum16(void)12143 dlil_verify_sum16(void)
12144 {
12145 	struct mbuf *m;
12146 	uint8_t *buf;
12147 	int n;
12148 
12149 	/* Make sure test data plus extra room for alignment fits in cluster */
12150 	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12151 
12152 	kprintf("DLIL: running SUM16 self-tests ... ");
12153 
12154 	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12155 	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12156 
12157 	buf = mtod(m, uint8_t *);               /* base address */
12158 
12159 	for (n = 0; n < SUMTBL_MAX; n++) {
12160 		uint16_t len = sumtbl[n].len;
12161 		int i;
12162 
12163 		/* Verify for all possible alignments */
12164 		for (i = 0; i < (int)sizeof(uint64_t); i++) {
12165 			uint16_t sum, sumr;
12166 			uint8_t *c;
12167 
12168 			/* Copy over test data to mbuf */
12169 			VERIFY(len <= sizeof(sumdata));
12170 			c = buf + i;
12171 			bcopy(sumdata, c, len);
12172 
12173 			/* Zero-offset test (align by data pointer) */
12174 			m->m_data = (caddr_t)c;
12175 			m->m_len = len;
12176 			sum = m_sum16(m, 0, len);
12177 
12178 			if (!sumtbl[n].init) {
12179 				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12180 				sumtbl[n].sumr = sumr;
12181 				sumtbl[n].init = TRUE;
12182 			} else {
12183 				sumr = sumtbl[n].sumr;
12184 			}
12185 
12186 			/* Something is horribly broken; stop now */
12187 			if (sumr != sumtbl[n].sumrp) {
12188 				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12189 				    "for len=%d align=%d sum=0x%04x "
12190 				    "[expected=0x%04x]\n", __func__,
12191 				    len, i, sum, sumr);
12192 				/* NOTREACHED */
12193 			} else if (sum != sumr) {
12194 				panic_plain("\n%s: broken m_sum16() for len=%d "
12195 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12196 				    __func__, len, i, sum, sumr);
12197 				/* NOTREACHED */
12198 			}
12199 
12200 			/* Alignment test by offset (fixed data pointer) */
12201 			m->m_data = (caddr_t)buf;
12202 			m->m_len = i + len;
12203 			sum = m_sum16(m, i, len);
12204 
12205 			/* Something is horribly broken; stop now */
12206 			if (sum != sumr) {
12207 				panic_plain("\n%s: broken m_sum16() for len=%d "
12208 				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
12209 				    __func__, len, i, sum, sumr);
12210 				/* NOTREACHED */
12211 			}
12212 #if INET
12213 			/* Simple sum16 contiguous buffer test by aligment */
12214 			sum = b_sum16(c, len);
12215 
12216 			/* Something is horribly broken; stop now */
12217 			if (sum != sumr) {
12218 				panic_plain("\n%s: broken b_sum16() for len=%d "
12219 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12220 				    __func__, len, i, sum, sumr);
12221 				/* NOTREACHED */
12222 			}
12223 #endif /* INET */
12224 		}
12225 	}
12226 	m_freem(m);
12227 
12228 	kprintf("PASSED\n");
12229 }
12230 #endif /* DEBUG || DEVELOPMENT */
12231 
/* Expand to a switch case that returns the symbol's name as a string */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel event code to its symbolic name for logging.
 * Returns the empty string for codes not listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12268 
12269 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12270 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12271 {
12272 #pragma unused(arg1)
12273 	struct ifnet *ifp = arg0;
12274 
12275 	if (ifnet_is_attached(ifp, 1)) {
12276 		nstat_ifnet_threshold_reached(ifp->if_index);
12277 		ifnet_decr_iorefcnt(ifp);
12278 	}
12279 }
12280 
/*
 * Check whether the interface has moved more than if_data_threshold
 * bytes (rx + tx combined) since the last notification and, if so,
 * schedule a thread call to notify NetworkStatistics.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 * The compare-and-swap lets only one thread claim a given
	 * crossing, and thread_call_isactive() suppresses a new
	 * notification while a previous one is still pending.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer the callout by one threshold interval */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured; notify right away */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12310 
12311 #if (DEVELOPMENT || DEBUG)
12312 /*
12313  * The sysctl variable name contains the input parameters of
12314  * ifnet_get_keepalive_offload_frames()
12315  *  ifp (interface index): name[0]
12316  *  frames_array_count:    name[1]
12317  *  frame_data_offset:     name[2]
12318  * The return length gives used_frames_count
12319  */
12320 static int
12321 sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
12322 {
12323 #pragma unused(oidp)
12324 	int *name = (int *)arg1;
12325 	u_int namelen = arg2;
12326 	int idx;
12327 	ifnet_t ifp = NULL;
12328 	u_int32_t frames_array_count;
12329 	size_t frame_data_offset;
12330 	u_int32_t used_frames_count;
12331 	struct ifnet_keepalive_offload_frame *frames_array = NULL;
12332 	int error = 0;
12333 	u_int32_t i;
12334 
12335 	/*
12336 	 * Only root can get look at other people TCP frames
12337 	 */
12338 	error = proc_suser(current_proc());
12339 	if (error != 0) {
12340 		goto done;
12341 	}
12342 	/*
12343 	 * Validate the input parameters
12344 	 */
12345 	if (req->newptr != USER_ADDR_NULL) {
12346 		error = EPERM;
12347 		goto done;
12348 	}
12349 	if (namelen != 3) {
12350 		error = EINVAL;
12351 		goto done;
12352 	}
12353 	if (req->oldptr == USER_ADDR_NULL) {
12354 		error = EINVAL;
12355 		goto done;
12356 	}
12357 	if (req->oldlen == 0) {
12358 		error = EINVAL;
12359 		goto done;
12360 	}
12361 	idx = name[0];
12362 	frames_array_count = name[1];
12363 	frame_data_offset = name[2];
12364 
12365 	/* Make sure the passed buffer is large enough */
12366 	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
12367 	    req->oldlen) {
12368 		error = ENOMEM;
12369 		goto done;
12370 	}
12371 
12372 	ifnet_head_lock_shared();
12373 	if (!IF_INDEX_IN_RANGE(idx)) {
12374 		ifnet_head_done();
12375 		error = ENOENT;
12376 		goto done;
12377 	}
12378 	ifp = ifindex2ifnet[idx];
12379 	ifnet_head_done();
12380 
12381 	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
12382 		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
12383 		Z_WAITOK);
12384 	if (frames_array == NULL) {
12385 		error = ENOMEM;
12386 		goto done;
12387 	}
12388 
12389 	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
12390 	    frames_array_count, frame_data_offset, &used_frames_count);
12391 	if (error != 0) {
12392 		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
12393 		    __func__, error);
12394 		goto done;
12395 	}
12396 
12397 	for (i = 0; i < used_frames_count; i++) {
12398 		error = SYSCTL_OUT(req, frames_array + i,
12399 		    sizeof(struct ifnet_keepalive_offload_frame));
12400 		if (error != 0) {
12401 			goto done;
12402 		}
12403 	}
12404 done:
12405 	if (frames_array != NULL) {
12406 		kfree_data(frames_array, frames_array_count *
12407 		    sizeof(struct ifnet_keepalive_offload_frame));
12408 	}
12409 	return error;
12410 }
12411 #endif /* DEVELOPMENT || DEBUG */
12412 
/*
 * Forward per-flow interface statistics to the TCP subsystem for
 * per-flow accounting.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12419 
/*
 * Atomically OR set_flags into *flags_p; returns the value the flags
 * word had before the update (OSBitOrAtomic semantics).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}

/* Atomically clear clear_flags in *flags_p */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12431 
/*
 * Atomically set bits in the interface's extended flags (if_eflags);
 * returns the previous value of if_eflags.
 */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}

/* Atomically clear bits in the interface's extended flags (if_eflags) */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}

/*
 * Atomically set bits in the interface's extra flags (if_xflags);
 * returns the previous value of if_xflags.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}

/* Atomically clear bits in the interface's extra flags (if_xflags) */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12455 
/*
 * Bump the traffic-rule generation count so that readers can detect
 * that the rule set changed (see ifnet_sync_traffic_rule_genid()).
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}

/*
 * Compare the caller's cached generation count against the current
 * one.  Returns TRUE (and refreshes *genid) when they differ, FALSE
 * when the caller is already in sync.
 */
__private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
{
	if (*genid != ifp->if_traffic_rule_genid) {
		*genid = ifp->if_traffic_rule_genid;
		return TRUE;
	}
	return FALSE;
}
/* Publish a new traffic-rule count and bump the generation count */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12477 
12478 static void
log_hexdump(void * data,size_t len)12479 log_hexdump(void *data, size_t len)
12480 {
12481 	size_t i, j, k;
12482 	unsigned char *ptr = (unsigned char *)data;
12483 #define MAX_DUMP_BUF 32
12484 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12485 
12486 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12487 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12488 			unsigned char msnbl = ptr[j] >> 4;
12489 			unsigned char lsnbl = ptr[j] & 0x0f;
12490 
12491 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12492 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12493 
12494 			if ((j % 2) == 1) {
12495 				buf[k++] = ' ';
12496 			}
12497 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12498 				buf[k++] = ' ';
12499 			}
12500 		}
12501 		buf[k] = 0;
12502 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12503 	}
12504 }
12505 
12506 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12507 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12508 net_check_compatible_if_filter(struct ifnet *ifp)
12509 {
12510 	if (ifp == NULL) {
12511 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12512 			return false;
12513 		}
12514 	} else {
12515 		if (ifp->if_flt_non_os_count > 0) {
12516 			return false;
12517 		}
12518 	}
12519 	return true;
12520 }
12521 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12522 
/*
 * Advance the output cursor (c/clen) by the k characters just written;
 * bails out to the enclosing function's "done" label once the buffer
 * is exhausted.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}

int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: find the interface with the deepest send (ifcq) queue
 * and the one with the deepest DLIL input queue, and format a short
 * summary into str.  Returns the number of characters written.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	/*
	 * NOTE(review): this walks ifindex2ifnet without the ifnet head
	 * lock (presumably acceptable for a best-effort debug dump), and
	 * the loop condition is 'ifidx < if_index' while index lookups
	 * elsewhere accept if_index itself (cf. IF_INDEX_IN_RANGE), so
	 * the last interface appears to be skipped -- confirm intended.
	 */
	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12571 
12572 #if DEVELOPMENT || DEBUG
12573 __private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12574 packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12575 {
12576 	struct flow_key key = {};
12577 	int error = 0;
12578 
12579 	if (req->newptr == USER_ADDR_NULL) {
12580 		return EINVAL;
12581 	}
12582 	if (req->newlen < sizeof(struct flow_key)) {
12583 		return EINVAL;
12584 	}
12585 	error = SYSCTL_IN(req, &key, sizeof(struct flow_key));
12586 	if (error != 0) {
12587 		return error;
12588 	}
12589 
12590 	switch (key.fk_ipver) {
12591 	case IPVERSION:
12592 		if (key.fk_proto != IPPROTO_UDP ||
12593 		    key.fk_sport == 0 || key.fk_dport == 0) {
12594 			return EINVAL;
12595 		}
12596 
12597 		if (key.fk_src4.s_addr == INADDR_ANY ||
12598 		    key.fk_dst4.s_addr == INADDR_ANY) {
12599 			return EINVAL;
12600 		}
12601 
12602 		break;
12603 	case IPV6_VERSION:
12604 		if (key.fk_proto != IPPROTO_UDP ||
12605 		    key.fk_sport == 0 || key.fk_dport == 0) {
12606 			return EINVAL;
12607 		}
12608 
12609 		if (IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12610 		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12611 			return EINVAL;
12612 		}
12613 
12614 		break;
12615 	case 0:
12616 		if (key.fk_proto != 0 ||
12617 		    key.fk_sport != 0 || key.fk_dport != 0) {
12618 			return EINVAL;
12619 		}
12620 
12621 		if (!IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12622 		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12623 			return EINVAL;
12624 		}
12625 
12626 		break;
12627 	default:
12628 		return EINVAL;
12629 	}
12630 
12631 	memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
12632 	return 0;
12633 }
12634 #endif /* DEVELOPMENT || DEBUG */
12635