xref: /xnu-10002.81.5/bsd/net/dlil.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63 
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70 
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
146 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151 
152 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
153 #define MAX_LINKADDR        4 /* LONGWORDS */
154 
155 #if 1
156 #define DLIL_PRINTF     printf
157 #else
158 #define DLIL_PRINTF     kprintf
159 #endif
160 
161 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
162 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
163 
164 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
165 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
166 
167 enum {
168 	kProtoKPI_v1    = 1,
169 	kProtoKPI_v2    = 2
170 };
171 
172 uint64_t if_creation_generation_count = 0;
173 
174 /*
175  * List of if_proto structures in if_proto_hash[] is protected by
176  * the ifnet lock.  The rest of the fields are initialized at protocol
177  * attach time and never change, thus no lock required as long as
178  * a reference to it is valid, via if_proto_ref().
179  */
/*
 * One protocol attachment to an interface, created per (ifnet,
 * protocol_family) pair at protocol attach time.  Hash-list linkage
 * is protected by the ifnet lock; the remaining fields never change
 * after attach (see the comment above), so a valid reference obtained
 * via if_proto_ref() may read them lock-free.
 */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;          /* if_proto_hash[] bucket linkage (ifnet lock) */
	u_int32_t                   refcount;           /* outstanding references; freed at 0 */
	u_int32_t                   detached;           /* nonzero once detached from the ifnet */
	struct ifnet                *ifp;               /* interface this protocol is attached to */
	protocol_family_t           protocol_family;    /* e.g. PF_INET, PF_INET6 */
	int                         proto_kpi;          /* kProtoKPI_v1 or kProtoKPI_v2; selects union arm */
	union {
		/* v1 KPI: per-packet input callback */
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		/* v2 KPI: packet-chain input callback; other ops identical to v1 */
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
208 
209 SLIST_HEAD(proto_hash_entry, if_proto);
210 
211 #define DLIL_SDLDATALEN \
212 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
213 
/*
 * DLIL's private wrapper around struct ifnet.  The public ifnet must
 * remain the first member so that DLIL_TO_IFP()/IFP_TO_DLIL() can cast
 * between the two representations.  Instances are recycled rather than
 * freed (see DLIF_INUSE/DLIF_REUSE below).
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct {
		struct ifaddr   ifa;            /* lladdr ifa */
		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;                         /* link-level address + backing sockaddr_dl storage */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* nonzero once permanent addr recorded */
	u_int8_t dl_if_unused;                  /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
241 
242 /* Values for dl_if_flags (private to DLIL) */
243 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
244 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
245 #define DLIF_DEBUG      0x4     /* has debugging info */
246 
247 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
248 
249 /* For gdb */
250 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
251 
/*
 * Debug variant of dlil_ifnet (allocated when ifnet_debug is set).
 * The plain dlil_ifnet must remain the first member so debug and
 * non-debug instances are interchangeable through the same pointer.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
262 
263 #define DLIL_TO_IFP(s)  (&s->dl_if)
264 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
265 
/*
 * One attached interface filter.  Callbacks are invoked by the
 * dlil_interface_filters_{input,output} paths; any callback pointer
 * may be NULL, in which case that hook is skipped for this filter.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* per-ifnet filter list linkage */
	u_int32_t                       filt_skip;      /* nonzero: temporarily bypass this filter */
	u_int32_t                       filt_flags;     /* DLIL_IFF_* flags supplied at attach */
	ifnet_t                         filt_ifp;       /* interface the filter is attached to */
	const char                      *filt_name;     /* caller-supplied identifying name */
	void                            *filt_cookie;   /* opaque caller context passed to callbacks */
	protocol_family_t               filt_protocol;  /* restrict to one family; 0 == all */
	iff_input_func                  filt_input;     /* inbound packet hook */
	iff_output_func                 filt_output;    /* outbound packet hook */
	iff_event_func                  filt_event;     /* interface event hook */
	iff_ioctl_func                  filt_ioctl;     /* ioctl interception hook */
	iff_detached_func               filt_detached;  /* final callback after detach completes */
};
280 
281 /* Mbuf queue used for freeing the excessive mbufs */
282 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
283 
284 struct proto_input_entry;
285 
286 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
287 
288 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
289 
290 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
291 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
292 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
293 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
294 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
295 
296 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
297 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
298     &dlil_lck_attributes);
299 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
300     &dlil_lck_attributes);
301 
302 #if DEBUG
303 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
304 #else
305 static unsigned int ifnet_debug;        /* debugging (disabled) */
306 #endif /* !DEBUG */
307 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
308 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
309 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
310 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
311 
312 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
313 
314 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
315 
316 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
317 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
318 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
319 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
320 
321 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
322 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
323 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
324 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
325 
326 static u_int32_t net_rtref;
327 
328 static struct dlil_main_threading_info dlil_main_input_thread_info;
329 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
330     (struct dlil_threading_info *)&dlil_main_input_thread_info;
331 
332 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
333 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
334 static void dlil_if_trace(struct dlil_ifnet *, int);
335 static void if_proto_ref(struct if_proto *);
336 static void if_proto_free(struct if_proto *);
337 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
338 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
339     u_int32_t list_count);
340 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
341 static void if_flt_monitor_busy(struct ifnet *);
342 static void if_flt_monitor_unbusy(struct ifnet *);
343 static void if_flt_monitor_enter(struct ifnet *);
344 static void if_flt_monitor_leave(struct ifnet *);
345 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
346     char **, protocol_family_t);
347 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
348     protocol_family_t);
349 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
350     const struct sockaddr_dl *);
351 static int ifnet_lookup(struct ifnet *);
352 static void if_purgeaddrs(struct ifnet *);
353 
354 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
355     struct mbuf *, char *);
356 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
357     struct mbuf *);
358 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
359     mbuf_t *, const struct sockaddr *, void *, char *, char *);
360 static void ifproto_media_event(struct ifnet *, protocol_family_t,
361     const struct kev_msg *);
362 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
363     unsigned long, void *);
364 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
365     struct sockaddr_dl *, size_t);
366 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
367     const struct sockaddr_dl *, const struct sockaddr *,
368     const struct sockaddr_dl *, const struct sockaddr *);
369 
370 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
371     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
372     boolean_t poll, struct thread *tp);
373 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
374     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
375 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
376 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
377     protocol_family_t *);
378 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
379     const struct ifnet_demux_desc *, u_int32_t);
380 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
381 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
382 #if !XNU_TARGET_OS_OSX
383 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
384     const struct sockaddr *, const char *, const char *,
385     u_int32_t *, u_int32_t *);
386 #else /* XNU_TARGET_OS_OSX */
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388     const struct sockaddr *, const char *, const char *);
389 #endif /* XNU_TARGET_OS_OSX */
390 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
391     const struct sockaddr *, const char *, const char *,
392     u_int32_t *, u_int32_t *);
393 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
394 static void ifp_if_free(struct ifnet *);
395 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
396 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
397 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
398 
399 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
400     dlil_freeq_t *, struct ifnet_stat_increment_param *);
401 
402 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
403     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
404     boolean_t, struct thread *);
405 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
406     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
407     boolean_t, struct thread *);
408 
409 static void dlil_main_input_thread_func(void *, wait_result_t);
410 static void dlil_main_input_thread_cont(void *, wait_result_t);
411 
412 static void dlil_input_thread_func(void *, wait_result_t);
413 static void dlil_input_thread_cont(void *, wait_result_t);
414 
415 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
416 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
417 
418 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
419     thread_continue_t *);
420 static void dlil_terminate_input_thread(struct dlil_threading_info *);
421 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
422     struct dlil_threading_info *, struct ifnet *, boolean_t);
423 static boolean_t dlil_input_stats_sync(struct ifnet *,
424     struct dlil_threading_info *);
425 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
426     u_int32_t, ifnet_model_t, boolean_t);
427 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
428     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
429 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
430 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
431 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
432 #if DEBUG || DEVELOPMENT
433 static void dlil_verify_sum16(void);
434 #endif /* DEBUG || DEVELOPMENT */
435 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
436     protocol_family_t);
437 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
438     protocol_family_t);
439 
440 static void dlil_incr_pending_thread_count(void);
441 static void dlil_decr_pending_thread_count(void);
442 
443 static void ifnet_detacher_thread_func(void *, wait_result_t);
444 static void ifnet_detacher_thread_cont(void *, wait_result_t);
445 static void ifnet_detach_final(struct ifnet *);
446 static void ifnet_detaching_enqueue(struct ifnet *);
447 static struct ifnet *ifnet_detaching_dequeue(void);
448 
449 static void ifnet_start_thread_func(void *, wait_result_t);
450 static void ifnet_start_thread_cont(void *, wait_result_t);
451 
452 static void ifnet_poll_thread_func(void *, wait_result_t);
453 static void ifnet_poll_thread_cont(void *, wait_result_t);
454 
455 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
456     classq_pkt_t *, boolean_t, boolean_t *);
457 
458 static void ifp_src_route_copyout(struct ifnet *, struct route *);
459 static void ifp_src_route_copyin(struct ifnet *, struct route *);
460 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
461 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
462 
463 static errno_t if_mcasts_update_async(struct ifnet *);
464 
465 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
470 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
471 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
473 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
474 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
475 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
476 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
477 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
478 
479 struct chain_len_stats tx_chain_len_stats;
480 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
481 
482 #if TEST_INPUT_THREAD_TERMINATION
483 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
484 #endif /* TEST_INPUT_THREAD_TERMINATION */
485 
486 /* The following are protected by dlil_ifnet_lock */
487 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
488 static u_int32_t ifnet_detaching_cnt;
489 static boolean_t ifnet_detaching_embryonic;
490 static void *ifnet_delayed_run; /* wait channel for detaching thread */
491 
492 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
493     &dlil_lck_attributes);
494 
495 static uint32_t ifnet_flowhash_seed;
496 
/*
 * Key material hashed (with ifnet_flowhash_seed) by
 * ifnet_calc_flowhash() to derive an interface's flow hash.
 * The struct is hashed as raw bytes, so its layout is part of
 * the hash definition — do not reorder or resize fields lightly.
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name, e.g. "en0" */
	uint32_t        ifk_unit;               /* interface unit number */
	uint32_t        ifk_flags;              /* if_flags snapshot */
	uint32_t        ifk_eflags;             /* extended flags snapshot */
	uint32_t        ifk_capabilities;       /* capabilities snapshot */
	uint32_t        ifk_capenable;          /* enabled capabilities snapshot */
	uint32_t        ifk_output_sched_model; /* output scheduling model */
	uint32_t        ifk_rand1;              /* random salt */
	uint32_t        ifk_rand2;              /* random salt */
};
508 
509 /* Flow control entry per interface */
/*
 * Flow control entry per interface; node of ifnet_fc_tree, a red-black
 * tree keyed by ifce_flowhash (see ifce_cmp).  Protected by ifnet_fc_lock.
 */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
	u_int32_t       ifce_flowhash;          /* tree key: interface flow hash */
	struct ifnet    *ifce_ifp;              /* interface this entry refers to */
};
515 
516 static uint32_t ifnet_calc_flowhash(struct ifnet *);
517 static int ifce_cmp(const struct ifnet_fc_entry *,
518     const struct ifnet_fc_entry *);
519 static int ifnet_fc_add(struct ifnet *);
520 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
521 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
522 
523 /* protected by ifnet_fc_lock */
524 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
525 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
526 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
527 
528 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
529 
530 extern void bpfdetach(struct ifnet *);
531 extern void proto_input_run(void);
532 
533 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
534     u_int32_t flags);
535 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
536     u_int32_t flags);
537 
538 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
539 
540 #if CONFIG_MACF
541 #if !XNU_TARGET_OS_OSX
542 int dlil_lladdr_ckreq = 1;
543 #else /* XNU_TARGET_OS_OSX */
544 int dlil_lladdr_ckreq = 0;
545 #endif /* XNU_TARGET_OS_OSX */
546 #endif /* CONFIG_MACF */
547 
548 #if DEBUG
549 int dlil_verbose = 1;
550 #else
551 int dlil_verbose = 0;
552 #endif /* DEBUG */
553 #if IFNET_INPUT_SANITY_CHK
554 /* sanity checking of input packet lists received */
555 static u_int32_t dlil_input_sanity_check = 0;
556 #endif /* IFNET_INPUT_SANITY_CHK */
557 /* rate limit debug messages */
558 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
559 
560 SYSCTL_DECL(_net_link_generic_system);
561 
562 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
563     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
564 
565 #define IF_SNDQ_MINLEN  32
566 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
568     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
569     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
570 
571 #define IF_RCVQ_MINLEN  32
572 #define IF_RCVQ_MAXLEN  256
573 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
574 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
575     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
576     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
577 
578 /*
579  * Protect against possible memory starvation that may happen
580  * when the driver is pushing data faster than the AP can process.
581  *
582  * If at any point during DLIL input phase any of the input queues
583  * exceeds the burst limit, DLIL will start to trim the queue,
584  * by returning mbufs in the input queue to the cache from which
585  * the mbufs were originally allocated, starting from the oldest
586  * mbuf and continuing until the new limit (see below) is reached.
587  *
 * In order to avoid a lock-step equilibrium, the trimming
589  * will continue PAST the burst limit, until the corresponding
590  * input queue is reduced to `if_rcvq_trim_pct' %.
591  *
592  * For example, if the input queue limit is 1024 packets,
593  * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
594  * the trimming will continue until the queue contains 819 packets
595  * (1024 * 80 / 100 == 819).
596  *
597  * Setting the burst limit too low can hurt the throughput,
598  * while setting the burst limit too high can defeat the purpose.
599  */
600 #define IF_RCVQ_BURST_LIMIT_MIN         1024
601 #define IF_RCVQ_BURST_LIMIT_DEFAULT     8192
602 #define IF_RCVQ_BURST_LIMIT_MAX         32768
603 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
604 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
605     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
606     sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
607 
608 #define IF_RCVQ_TRIM_PCT_MIN            20
609 #define IF_RCVQ_TRIM_PCT_DEFAULT        80
610 #define IF_RCVQ_TRIM_PCT_MAX            100
611 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
612 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
613     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
614     sysctl_rcvq_trim_pct, "I",
615     "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
616 
617 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
618 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
619 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
620     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
621     "ilog2 of EWMA decay rate of avg inbound packets");
622 
623 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
624 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
625 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
626 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
627     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
628     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
629     "Q", "input poll mode freeze time");
630 
631 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
632 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
633 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
634 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
635     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
636     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
637     "Q", "input poll sampling time");
638 
639 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
640 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
641     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
642     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
643     "Q", "input poll interval (time)");
644 
645 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
646 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
647 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
648     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
649     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
650 
651 #define IF_RXPOLL_WLOWAT        10
652 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
653 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
654     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
655     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
656     "I", "input poll wakeup low watermark");
657 
658 #define IF_RXPOLL_WHIWAT        100
659 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
660 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
661     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
662     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
663     "I", "input poll wakeup high watermark");
664 
665 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
666 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
667     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
668     "max packets per poll call");
669 
670 u_int32_t if_rxpoll = 1;
671 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
672     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
673     sysctl_rxpoll, "I", "enable opportunistic input polling");
674 
675 #if TEST_INPUT_THREAD_TERMINATION
676 static u_int32_t if_input_thread_termination_spin = 0;
677 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
678     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
679     &if_input_thread_termination_spin, 0,
680     sysctl_input_thread_termination_spin,
681     "I", "input thread termination spin limit");
682 #endif /* TEST_INPUT_THREAD_TERMINATION */
683 
684 static u_int32_t cur_dlil_input_threads = 0;
685 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
686     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
687     "Current number of DLIL input threads");
688 
689 #if IFNET_INPUT_SANITY_CHK
690 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
691     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
692     "Turn on sanity checking in DLIL input");
693 #endif /* IFNET_INPUT_SANITY_CHK */
694 
695 static u_int32_t if_flowadv = 1;
696 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
697     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
698     "enable flow-advisory mechanism");
699 
700 static u_int32_t if_delaybased_queue = 1;
701 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
702     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
703     "enable delay based dynamic queue sizing");
704 
705 static uint64_t hwcksum_in_invalidated = 0;
706 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
707     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
708     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
709 
710 uint32_t hwcksum_dbg = 0;
711 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
712     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
713     "enable hardware cksum debugging");
714 
715 u_int32_t ifnet_start_delayed = 0;
716 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
717     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
718     "number of times start was delayed");
719 
720 u_int32_t ifnet_delay_start_disabled = 0;
721 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
722     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
723     "number of times start was delayed");
724 
/*
 * Atomically count another occurrence of delayed start being disabled;
 * the counter is exported via the start_delay_disabled sysctl above.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
730 
731 #define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
732 #define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
733 #define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
734 #define HWCKSUM_DBG_MASK \
735 	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
736 	HWCKSUM_DBG_FINALIZE_FORCED)
737 
738 static uint32_t hwcksum_dbg_mode = 0;
739 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
740     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
741     0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
742 
743 static uint64_t hwcksum_dbg_partial_forced = 0;
744 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
745     hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
746     &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
747 
748 static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
749 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
750     hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
751     &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
752 
753 static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
754 SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
755     hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
756     &hwcksum_dbg_partial_rxoff_forced, 0,
757     sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
758     "forced partial cksum rx offset");
759 
760 static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
761 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
762     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
763     0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
764     "adjusted partial cksum rx offset");
765 
766 static uint64_t hwcksum_dbg_verified = 0;
767 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
768     hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
769     &hwcksum_dbg_verified, "packets verified for having good checksum");
770 
771 static uint64_t hwcksum_dbg_bad_cksum = 0;
772 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
773     hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
774     &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");
775 
776 static uint64_t hwcksum_dbg_bad_rxoff = 0;
777 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
778     hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
779     &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");
780 
781 static uint64_t hwcksum_dbg_adjusted = 0;
782 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
783     hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
784     &hwcksum_dbg_adjusted, "packets with rxoff adjusted");
785 
786 static uint64_t hwcksum_dbg_finalized_hdr = 0;
787 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
788     hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
789     &hwcksum_dbg_finalized_hdr, "finalized headers");
790 
791 static uint64_t hwcksum_dbg_finalized_data = 0;
792 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
793     hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
794     &hwcksum_dbg_finalized_data, "finalized payloads");
795 
796 uint32_t hwcksum_tx = 1;
797 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
798     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
799     "enable transmit hardware checksum offload");
800 
801 uint32_t hwcksum_rx = 1;
802 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
803     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
804     "enable receive hardware checksum offload");
805 
806 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
807     CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
808     sysctl_tx_chain_len_stats, "S", "");
809 
810 uint32_t tx_chain_len_count = 0;
811 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
812     CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");
813 
814 static uint32_t threshold_notify = 1;           /* enable/disable */
815 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
816     CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");
817 
818 static uint32_t threshold_interval = 2;         /* in seconds */
819 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
820     CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");
821 
822 #if (DEVELOPMENT || DEBUG)
823 static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
824 SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
825     CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
826 #endif /* DEVELOPMENT || DEBUG */
827 
828 struct net_api_stats net_api_stats;
829 SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
830     &net_api_stats, net_api_stats, "");
831 
832 uint32_t net_wake_pkt_debug = 0;
833 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
834     CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");
835 
836 static void log_hexdump(void *data, size_t len);
837 
838 unsigned int net_rxpoll = 1;
839 unsigned int net_affinity = 1;
840 unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */
841 
842 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
843 
844 extern u_int32_t        inject_buckets;
845 
846 /* DLIL data threshold thread call */
847 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
848 
849 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)850 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
851 {
852 	/*
853 	 * update filter count and route_generation ID to let TCP
854 	 * know it should reevalute doing TSO or not
855 	 */
856 	if (filter_enable) {
857 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
858 	} else {
859 		VERIFY(ifp->if_flt_no_tso_count != 0);
860 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
861 	}
862 	routegenid_update();
863 }
864 
865 #if SKYWALK
866 
867 #if defined(XNU_TARGET_OS_OSX)
868 static bool net_check_compatible_if_filter(struct ifnet *ifp);
869 #endif /* XNU_TARGET_OS_OSX */
870 
871 /* if_attach_nx flags defined in os_skywalk_private.h */
872 static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
873 unsigned int if_enable_fsw_ip_netagent =
874     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
875 unsigned int if_enable_fsw_transport_netagent =
876     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
877 
878 unsigned int if_netif_all =
879     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
880 
881 /* Configure flowswitch to use max mtu sized buffer */
882 static bool fsw_use_max_mtu_buffer = false;
883 
884 #if (DEVELOPMENT || DEBUG)
885 static int
886 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
887 {
888 #pragma unused(oidp, arg1, arg2)
889 	unsigned int new_value;
890 	int changed;
891 	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
892 	    &new_value, &changed);
893 	if (error) {
894 		return error;
895 	}
896 	if (changed) {
897 		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
898 		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
899 			return ENOTSUP;
900 		}
901 		if_attach_nx = new_value;
902 	}
903 	return 0;
904 }
905 
906 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
907     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
908     0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");
909 
910 #endif /* DEVELOPMENT || DEBUG */
911 
912 static int
913 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
914 {
915 #pragma unused(oidp, arg1, arg2)
916 	unsigned int new_value;
917 	int changed;
918 	int error;
919 
920 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
921 	    sizeof(if_enable_fsw_transport_netagent),
922 	    &new_value, &changed);
923 	if (error == 0 && changed != 0) {
924 		if (new_value != 0 && new_value != 1) {
925 			/* only allow 0 or 1 */
926 			error = EINVAL;
927 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
928 			/* netagent can be enabled/disabled */
929 			if_enable_fsw_transport_netagent = new_value;
930 			if (new_value == 0) {
931 				kern_nexus_deregister_netagents();
932 			} else {
933 				kern_nexus_register_netagents();
934 			}
935 		} else {
936 			/* netagent can't be enabled */
937 			error = ENOTSUP;
938 		}
939 	}
940 	return error;
941 }
942 
943 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
944     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
945     0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
946     "enable flowswitch netagent");
947 
948 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
949 
950 #include <skywalk/os_skywalk_private.h>
951 
952 boolean_t
ifnet_nx_noauto(ifnet_t ifp)953 ifnet_nx_noauto(ifnet_t ifp)
954 {
955 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
956 }
957 
958 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)959 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
960 {
961 	return ifnet_is_low_latency(ifp);
962 }
963 
964 boolean_t
ifnet_is_low_latency(ifnet_t ifp)965 ifnet_is_low_latency(ifnet_t ifp)
966 {
967 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
968 }
969 
/*
 * Decide whether the netif compat layer should be plumbed in for this
 * interface.  Gated globally by the IF_ATTACH_NX_NETIF_COMPAT bit of
 * if_attach_nx; on non-macOS targets Wi-Fi "ap" interfaces additionally
 * defer to the if_netif_all override.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point: interface literally named "ap" */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
994 
/*
 * Decide whether this interface should get the flowswitch transport
 * netagent.  Requires the global enable; ipsec/utun interfaces consult
 * their own per-interface setting, all others are limited to cellular
 * and ethernet families gated by the if_attach_nx bitmap and the
 * per-ifnet no-auto-nexus override.
 */
boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
{
	if (if_is_fsw_transport_netagent_enabled()) {
		/* check if netagent has been manually enabled for ipsec/utun */
		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
			return ipsec_interface_needs_netagent(ifp);
		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
			return utun_interface_needs_netagent(ifp);
		}

		/* check ifnet no auto nexus override */
		if (ifnet_nx_noauto(ifp)) {
			return FALSE;
		}

		/* check global if_attach_nx configuration */
		switch (ifp->if_family) {
		case IFNET_FAMILY_CELLULAR:
		case IFNET_FAMILY_ETHERNET:
			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
				return TRUE;
			}
			break;
		default:
			break;
		}
	}
	return FALSE;
}
1025 
1026 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1027 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1028 {
1029 #pragma unused(ifp)
1030 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1031 		return TRUE;
1032 	}
1033 	return FALSE;
1034 }
1035 
1036 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1037 ifnet_needs_netif_netagent(ifnet_t ifp)
1038 {
1039 #pragma unused(ifp)
1040 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1041 }
1042 
1043 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1044 dlil_detach_nexus_instance(nexus_controller_t controller,
1045     const char *func_str, uuid_t instance, uuid_t device)
1046 {
1047 	errno_t         err;
1048 
1049 	if (instance == NULL || uuid_is_null(instance)) {
1050 		return FALSE;
1051 	}
1052 
1053 	/* followed by the device port */
1054 	if (device != NULL && !uuid_is_null(device)) {
1055 		err = kern_nexus_ifdetach(controller, instance, device);
1056 		if (err != 0) {
1057 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1058 			    func_str, err);
1059 		}
1060 	}
1061 	err = kern_nexus_controller_free_provider_instance(controller,
1062 	    instance);
1063 	if (err != 0) {
1064 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
1065 		    func_str, err);
1066 	}
1067 	return TRUE;
1068 }
1069 
1070 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1071 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1072     uuid_t device)
1073 {
1074 	boolean_t               detached = FALSE;
1075 	nexus_controller_t      controller = kern_nexus_shared_controller();
1076 	int                     err;
1077 
1078 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1079 	    device)) {
1080 		detached = TRUE;
1081 	}
1082 	if (provider != NULL && !uuid_is_null(provider)) {
1083 		detached = TRUE;
1084 		err = kern_nexus_controller_deregister_provider(controller,
1085 		    provider);
1086 		if (err != 0) {
1087 			DLIL_PRINTF("%s deregister_provider %d\n",
1088 			    func_str, err);
1089 		}
1090 	}
1091 	return detached;
1092 }
1093 
/*
 * Register a nexus provider named "com.apple.<type>.<ifname>" under the
 * default domain provider for `type' (netif or flowswitch), then
 * allocate one instance of it.  On success *provider and *instance hold
 * the new UUIDs.  If instance allocation fails, the freshly registered
 * provider is deregistered again.  Returns 0 or an errno.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the registration; its own failure is ignored here */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	/* note: the success path also falls through here, with err == 0 */
	return err;
}
1143 
1144 static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp,if_nexus_netif_t netif_nx)1145 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
1146 {
1147 	nexus_attr_t            attr = NULL;
1148 	nexus_controller_t      controller;
1149 	errno_t                 err;
1150 
1151 	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
1152 		/* it's already attached */
1153 		if (dlil_verbose) {
1154 			DLIL_PRINTF("%s: %s already has nexus attached\n",
1155 			    __func__, if_name(ifp));
1156 			/* already attached */
1157 		}
1158 		goto failed;
1159 	}
1160 
1161 	err = kern_nexus_attr_create(&attr);
1162 	if (err != 0) {
1163 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1164 		    if_name(ifp));
1165 		goto failed;
1166 	}
1167 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
1168 	VERIFY(err == 0);
1169 
1170 	controller = kern_nexus_shared_controller();
1171 
1172 	/* create the netif provider and instance */
1173 	err = dlil_create_provider_and_instance(controller,
1174 	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
1175 	    &netif_nx->if_nif_instance, attr);
1176 	if (err != 0) {
1177 		goto failed;
1178 	}
1179 	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
1180 	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
1181 	if (err != 0) {
1182 		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
1183 		    __func__, err);
1184 		/* cleanup provider and instance */
1185 		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
1186 		    netif_nx->if_nif_instance, NULL);
1187 		goto failed;
1188 	}
1189 	return TRUE;
1190 
1191 failed:
1192 	if (attr != NULL) {
1193 		kern_nexus_attr_destroy(attr);
1194 	}
1195 	return FALSE;
1196 }
1197 
1198 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1199 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1200 {
1201 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1202 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1203 		goto failed;
1204 	}
1205 	switch (ifp->if_type) {
1206 	case IFT_CELLULAR:
1207 	case IFT_ETHER:
1208 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1209 			/* don't auto-attach */
1210 			goto failed;
1211 		}
1212 		break;
1213 	default:
1214 		/* don't auto-attach */
1215 		goto failed;
1216 	}
1217 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1218 
1219 failed:
1220 	return FALSE;
1221 }
1222 
1223 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1224 dlil_is_native_netif_nexus(ifnet_t ifp)
1225 {
1226 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1227 }
1228 
/*
 * Tear down the netif nexus recorded in `nexus_netif': provider,
 * instance, and interface attachment.
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1236 
1237 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1238 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1239 {
1240 	struct ifreq        ifr;
1241 	int                 error;
1242 
1243 	bzero(&ifr, sizeof(ifr));
1244 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1245 	if (error == 0) {
1246 		*ifdm_p = ifr.ifr_devmtu;
1247 	}
1248 	return error;
1249 }
1250 
/*
 * On macOS, grow *large_buf_size for native Skywalk netifs so that it
 * covers the driver's advertised TSO MTU (or, with TSO off, at least
 * sk_fsw_gso_mtu), capped at NX_FSW_MAX_LARGE_BUFSIZE.  No-op on other
 * platforms and for non-native netifs.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1286 
1287 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1288 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1289     bool *use_multi_buflet, uint32_t *large_buf_size)
1290 {
1291 	struct kern_pbufpool_memory_info rx_pp_info;
1292 	struct kern_pbufpool_memory_info tx_pp_info;
1293 	uint32_t if_max_mtu = 0;
1294 	uint32_t drv_buf_size;
1295 	struct ifdevmtu ifdm;
1296 	int err;
1297 
1298 	/*
1299 	 * To perform intra-stack RX aggregation flowswitch needs to use
1300 	 * multi-buflet packet.
1301 	 */
1302 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1303 
1304 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1305 	/*
1306 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1307 	 * but the driver advertises the MAX MTU as only 9K.
1308 	 */
1309 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1310 		if_max_mtu = IP_MAXPACKET;
1311 		goto skip_mtu_ioctl;
1312 	}
1313 
1314 	/* determine max mtu */
1315 	bzero(&ifdm, sizeof(ifdm));
1316 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1317 	if (__improbable(err != 0)) {
1318 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1319 		    __func__, if_name(ifp));
1320 		/* use default flowswitch buffer size */
1321 		if_max_mtu = NX_FSW_BUFSIZE;
1322 	} else {
1323 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1324 		    ifdm.ifdm_max, ifdm.ifdm_current);
1325 		/* rdar://problem/44589731 */
1326 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1327 	}
1328 
1329 skip_mtu_ioctl:
1330 	if (if_max_mtu == 0) {
1331 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1332 		    __func__, if_name(ifp));
1333 		return EINVAL;
1334 	}
1335 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1336 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1337 		    "max bufsize(%d)\n", __func__,
1338 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1339 		return EINVAL;
1340 	}
1341 
1342 	/*
1343 	 * for skywalk native driver, consult the driver packet pool also.
1344 	 */
1345 	if (dlil_is_native_netif_nexus(ifp)) {
1346 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1347 		    &tx_pp_info);
1348 		if (err != 0) {
1349 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1350 			    __func__, if_name(ifp));
1351 			return ENXIO;
1352 		}
1353 		drv_buf_size = tx_pp_info.kpm_bufsize *
1354 		    tx_pp_info.kpm_max_frags;
1355 		if (if_max_mtu > drv_buf_size) {
1356 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1357 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1358 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1359 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1360 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1361 			return EINVAL;
1362 		}
1363 	} else {
1364 		drv_buf_size = if_max_mtu;
1365 	}
1366 
1367 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1368 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1369 		*use_multi_buflet = true;
1370 		/* default flowswitch buffer size */
1371 		*buf_size = NX_FSW_BUFSIZE;
1372 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1373 	} else {
1374 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1375 	}
1376 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1377 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1378 	if (*buf_size >= *large_buf_size) {
1379 		*large_buf_size = 0;
1380 	}
1381 	return 0;
1382 }
1383 
1384 static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp,if_nexus_flowswitch_t nexus_fsw)1385 _dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
1386 {
1387 	nexus_attr_t            attr = NULL;
1388 	nexus_controller_t      controller;
1389 	errno_t                 err = 0;
1390 	uuid_t                  netif;
1391 	uint32_t                buf_size = 0;
1392 	uint32_t                large_buf_size = 0;
1393 	bool                    multi_buflet;
1394 
1395 	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
1396 	    IFNET_IS_VMNET(ifp)) {
1397 		goto failed;
1398 	}
1399 
1400 	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
1401 		/* not possible to attach (netif native/compat not plumbed) */
1402 		goto failed;
1403 	}
1404 
1405 	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
1406 		/* don't auto-attach */
1407 		goto failed;
1408 	}
1409 
1410 	/* get the netif instance from the ifp */
1411 	err = kern_nexus_get_netif_instance(ifp, netif);
1412 	if (err != 0) {
1413 		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
1414 		    if_name(ifp));
1415 		goto failed;
1416 	}
1417 
1418 	err = kern_nexus_attr_create(&attr);
1419 	if (err != 0) {
1420 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1421 		    if_name(ifp));
1422 		goto failed;
1423 	}
1424 
1425 	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
1426 	    &multi_buflet, &large_buf_size);
1427 	if (err != 0) {
1428 		goto failed;
1429 	}
1430 	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
1431 	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);
1432 
1433 	/* Configure flowswitch buffer size */
1434 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
1435 	VERIFY(err == 0);
1436 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
1437 	    large_buf_size);
1438 	VERIFY(err == 0);
1439 
1440 	/*
1441 	 * Configure flowswitch to use super-packet (multi-buflet).
1442 	 */
1443 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
1444 	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
1445 	VERIFY(err == 0);
1446 
1447 	/* create the flowswitch provider and instance */
1448 	controller = kern_nexus_shared_controller();
1449 	err = dlil_create_provider_and_instance(controller,
1450 	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
1451 	    &nexus_fsw->if_fsw_instance, attr);
1452 	if (err != 0) {
1453 		goto failed;
1454 	}
1455 
1456 	/* attach the device port */
1457 	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
1458 	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
1459 	if (err != 0) {
1460 		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
1461 		    __func__, err, if_name(ifp));
1462 		/* cleanup provider and instance */
1463 		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1464 		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1465 		goto failed;
1466 	}
1467 	return TRUE;
1468 
1469 failed:
1470 	if (err != 0) {
1471 		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
1472 		    __func__, if_name(ifp), err);
1473 	} else {
1474 		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
1475 		    __func__, if_name(ifp));
1476 	}
1477 	if (attr != NULL) {
1478 		kern_nexus_attr_destroy(attr);
1479 	}
1480 	return FALSE;
1481 }
1482 
/*
 * Auto-attach a flowswitch nexus to `ifp'.  The attach itself runs
 * without the ifnet lock held; the result is then published into
 * ifp->if_nx_flowswitch under the exclusive ifnet lock, with a re-check
 * that the interface has not started detaching in the meantime (the
 * freshly-built nexus is torn down again if it has).
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1528 
/*
 * Tear down the flowswitch nexus recorded in `nexus_fsw': provider,
 * instance, and device-port attachment.
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1536 
1537 __attribute__((noinline))
1538 static void
dlil_netif_detach_notify(ifnet_t ifp)1539 dlil_netif_detach_notify(ifnet_t ifp)
1540 {
1541 	ifnet_detach_notify_cb_t notify = NULL;
1542 	void *arg = NULL;
1543 
1544 	ifnet_get_detach_notify(ifp, &notify, &arg);
1545 	if (notify == NULL) {
1546 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1547 		return;
1548 	}
1549 	(*notify)(arg);
1550 }
1551 
/*
 * Quiesce data movement on `ifp', then detach its flowswitch nexus (if
 * any) followed by its netif nexus (if any), clearing the corresponding
 * state in the ifnet before resuming data movement.  The ASSERTs check
 * the invariant that the UUIDs of each nexus are set and cleared as a
 * unit.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1583 
1584 boolean_t
ifnet_add_netagent(ifnet_t ifp)1585 ifnet_add_netagent(ifnet_t ifp)
1586 {
1587 	int     error;
1588 
1589 	error = kern_nexus_interface_add_netagent(ifp);
1590 	os_log(OS_LOG_DEFAULT,
1591 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1592 	    ifp->if_xname, error);
1593 	return error == 0;
1594 }
1595 
1596 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1597 ifnet_remove_netagent(ifnet_t ifp)
1598 {
1599 	int     error;
1600 
1601 	error = kern_nexus_interface_remove_netagent(ifp);
1602 	os_log(OS_LOG_DEFAULT,
1603 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1604 	    ifp->if_xname, error);
1605 	return error == 0;
1606 }
1607 
1608 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1609 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1610 {
1611 	if (!IF_FULLY_ATTACHED(ifp)) {
1612 		return FALSE;
1613 	}
1614 	return dlil_attach_flowswitch_nexus(ifp);
1615 }
1616 
1617 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1618 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1619 {
1620 	if_nexus_flowswitch     nexus_fsw;
1621 
1622 	ifnet_lock_exclusive(ifp);
1623 	nexus_fsw = ifp->if_nx_flowswitch;
1624 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1625 	ifnet_lock_done(ifp);
1626 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1627 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1628 }
1629 
1630 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1631 ifnet_attach_netif_nexus(ifnet_t ifp)
1632 {
1633 	boolean_t       nexus_attached;
1634 	if_nexus_netif  nexus_netif;
1635 
1636 	if (!IF_FULLY_ATTACHED(ifp)) {
1637 		return FALSE;
1638 	}
1639 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1640 	if (nexus_attached) {
1641 		ifnet_lock_exclusive(ifp);
1642 		ifp->if_nx_netif = nexus_netif;
1643 		ifnet_lock_done(ifp);
1644 	}
1645 	return nexus_attached;
1646 }
1647 
/*
 * Detach the netif nexus from the interface: snapshot and clear
 * if_nx_netif under the exclusive ifnet lock, then perform the detach
 * outside the lock using the snapshotted identifiers.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif  nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1661 
1662 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1663 ifnet_attach_native_flowswitch(ifnet_t ifp)
1664 {
1665 	if (!dlil_is_native_netif_nexus(ifp)) {
1666 		/* not a native netif */
1667 		return;
1668 	}
1669 	ifnet_attach_flowswitch_nexus(ifp);
1670 }
1671 
/*
 * Install the flowswitch Rx callback and its argument.  Blocks until all
 * outstanding users of the current callback (tracked by
 * if_fsw_rx_cb_ref) have drained before replacing it.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		/* woken by ifnet_release_flowswitch_rx_callback() */
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1687 
/*
 * Look up the flowswitch Rx callback and its argument.  On success (0),
 * *cbp/*argp are filled in and if_fsw_rx_cb_ref is bumped; the caller
 * must drop the reference via ifnet_release_flowswitch_rx_callback().
 * Returns ENOENT when no callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the unlocked read above may be stale */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1709 
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback().  When
 * the count drains to zero, wake any thread blocked in
 * ifnet_set_flowswitch_rx_callback().
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1719 
/*
 * Set the delegate parent interface for 'difp'.  Blocks until all
 * outstanding references on the current parent (tracked by
 * if_delegate_parent_ref) have been released.  Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		/* woken by ifnet_release_delegate_parent() */
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1734 
/*
 * Look up the delegate parent of 'difp'.  On success (0), *parentp is
 * set and if_delegate_parent_ref is bumped; the caller must drop the
 * reference via ifnet_release_delegate_parent().  Returns ENOENT when
 * no parent is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1748 
/*
 * Drop a reference taken by ifnet_get_delegate_parent().  When the count
 * drains to zero, wake any thread blocked in ifnet_set_delegate_parent().
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1758 
/*
 * Record the detach-notify callback and its argument.  The caller must
 * hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1767 
/*
 * Fetch the detach-notify callback and its argument.  The caller must
 * hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1776 
/*
 * Locking wrapper around ifnet_set_detach_notify_locked(): takes the
 * ifnet lock exclusively for the duration of the update.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1785 
/*
 * Locking wrapper around ifnet_get_detach_notify_locked(): takes the
 * ifnet lock exclusively for the duration of the read.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1794 #endif /* SKYWALK */
1795 
/*
 * Validate an inbound mbuf before processing: it must carry a packet
 * header (MBUF_PKTHDR) and its receive interface must be set and match
 * 'ifp' (the rcvif match is not enforced for the loopback interface).
 * Panics if the check fails.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}
1804 
/*
 * Exponentially-weighted moving average computed with shifts:
 *   old = (old * (2^decay - 1) + new) / 2^decay
 * If 'old' is zero the average is seeded directly with 'new'.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1813 
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)

/* Per-speed packet/byte watermarks used to tune opportunistic polling. */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* Ordered by increasing speed; an entry with speed == 0 terminates the table. */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1833 
/* Serializes updates to (and wakeups on) dlil_pending_thread_cnt. */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* Count of pending DLIL threads; see the incr/decr helpers below. */
static uint32_t dlil_pending_thread_cnt = 0;
1837 
/* Bump the pending-thread counter under dlil_thread_sync_lock. */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1846 
/*
 * Drop the pending-thread counter under dlil_thread_sync_lock; wake any
 * waiter sleeping on the counter when it reaches zero.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1859 
1860 int
proto_hash_value(u_int32_t protocol_family)1861 proto_hash_value(u_int32_t protocol_family)
1862 {
1863 	/*
1864 	 * dlil_proto_unplumb_all() depends on the mapping between
1865 	 * the hash bucket index and the protocol family defined
1866 	 * here; future changes must be applied there as well.
1867 	 */
1868 	switch (protocol_family) {
1869 	case PF_INET:
1870 		return 0;
1871 	case PF_INET6:
1872 		return 1;
1873 	case PF_VLAN:
1874 		return 2;
1875 	case PF_UNSPEC:
1876 	default:
1877 		return 3;
1878 	}
1879 }
1880 
1881 /*
1882  * Caller must already be holding ifnet lock.
1883  */
/*
 * Look up the if_proto entry attached to 'ifp' for 'protocol_family'.
 * Returns NULL if none is attached; otherwise returns the entry with its
 * refcount bumped (if_proto_ref()) — presumably released by the caller
 * via if_proto_free().
 */
static struct if_proto *
find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
{
	struct if_proto *proto = NULL;
	/* hash bucket for this protocol family */
	u_int32_t i = proto_hash_value(protocol_family);

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);

	if (ifp->if_proto_hash != NULL) {
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
	}

	/* walk the bucket's collision chain for an exact family match */
	while (proto != NULL && proto->protocol_family != protocol_family) {
		proto = SLIST_NEXT(proto, next_hash);
	}

	if (proto != NULL) {
		if_proto_ref(proto);
	}

	return proto;
}
1906 
/* Take a reference on a protocol attachment (relaxed atomic increment). */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1912 
1913 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1914 
/*
 * Drop a reference on a protocol attachment.  On the last release:
 * invoke the protocol's detached callback (v1 or v2 KPI), purge its
 * routes, post KEV_DL_PROTO_DETACHED, and — if no protocols remain on
 * the interface — mark the interface down, before freeing the if_proto.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* references remain; nothing more to do */
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1976 
1977 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1978 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1979 {
1980 #if !MACH_ASSERT
1981 #pragma unused(ifp)
1982 #endif
1983 	unsigned int type = 0;
1984 	int ass = 1;
1985 
1986 	switch (what) {
1987 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1988 		type = LCK_RW_ASSERT_EXCLUSIVE;
1989 		break;
1990 
1991 	case IFNET_LCK_ASSERT_SHARED:
1992 		type = LCK_RW_ASSERT_SHARED;
1993 		break;
1994 
1995 	case IFNET_LCK_ASSERT_OWNED:
1996 		type = LCK_RW_ASSERT_HELD;
1997 		break;
1998 
1999 	case IFNET_LCK_ASSERT_NOTOWNED:
2000 		/* nothing to do here for RW lock; bypass assert */
2001 		ass = 0;
2002 		break;
2003 
2004 	default:
2005 		panic("bad ifnet assert type: %d", what);
2006 		/* NOTREACHED */
2007 	}
2008 	if (ass) {
2009 		LCK_RW_ASSERT(&ifp->if_lock, type);
2010 	}
2011 }
2012 
/* Acquire ifp->if_lock shared (reader). */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire ifp->if_lock exclusive (writer). */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release ifp->if_lock (held in either mode). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2030 
#if INET
/* Acquire the per-interface inet data lock shared (reader). */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-interface inet data lock exclusive (writer). */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-interface inet data lock (either mode). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
2050 
/* Acquire the per-interface inet6 data lock shared (reader). */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-interface inet6 data lock exclusive (writer). */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-interface inet6 data lock (either mode). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2068 
/* Acquire the global interface-list lock shared (reader). */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list lock exclusive (writer). */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list lock (either mode). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2092 
2093 /*
2094  * dlil_ifp_protolist
2095  * - get the list of protocols attached to the interface, or just the number
2096  *   of attached protocols
2097  * - if the number returned is greater than 'list_count', truncation occurred
2098  *
2099  * Note:
2100  * - caller must already be holding ifnet lock.
2101  */
static u_int32_t
dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
    u_int32_t list_count)
{
	u_int32_t       count = 0;
	int             i;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);

	if (ifp->if_proto_hash == NULL) {
		goto done;
	}

	/* walk every hash bucket, collecting families until 'list' is full */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;
		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
			if (list != NULL && count < list_count) {
				list[count] = proto->protocol_family;
			}
			/* keep counting past list_count so truncation is detectable */
			count++;
		}
	}
done:
	return count;
}
2127 
2128 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)2129 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
2130 {
2131 	ifnet_lock_shared(ifp);
2132 	count = dlil_ifp_protolist(ifp, protolist, count);
2133 	ifnet_lock_done(ifp);
2134 	return count;
2135 }
2136 
/* Free a protocol-family list previously allocated for a caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2142 
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for the interface.
 * If 'event_data' is NULL, a minimal net_event_data carrying only the
 * interface name/family/unit is sent.  The interface generation count
 * is updated except for frequent link-quality/RRC/primary-elected
 * events, or when the caller passes 'suppress_generation'.
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2204 
/*
 * Allocate the per-interface protocol stats: TCP/UDP local stats and the
 * IPv4/IPv6 ECN stats.  The tcp/udp objects are carved out of oversized
 * zone elements so the stats begin on a 64-bit boundary; the original
 * allocation address is stashed one pointer-size below the aligned base
 * so it can be recovered at zfree() time.  Returns 0 on success, EINVAL
 * otherwise (everything allocated here is released on failure).
 *
 * NOTE(review): if called when if_tcp_stat/if_udp_stat already exist,
 * 'ret' stays EINVAL and the cleanup below frees the existing objects —
 * callers appear to invoke this only once per ifnet; confirm.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original allocation address stashed below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2290 
/*
 * Reset all opportunistic-polling state for the interface: clear the
 * poll cycle, flags and request count, force the input model back to
 * IFNET_MODEL_INPUT_POLL_OFF, and zero the polling statistics and
 * hold/sample timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2309 
/*
 * Initialize the threading state in 'inp' and, if the chosen input
 * strategy needs one, start a dedicated input thread.
 *
 * Thread function selection:
 *  - ifp == NULL: the main input thread (created at dlil_init time);
 *  - legacy interface with rx-poll enabled: hybrid polling thread;
 *  - net_async set or legacy interface: asynchronous input thread;
 *  - otherwise: synchronous strategy, no thread is created and the
 *    function returns ENODEV after setting up queues and locks.
 *
 * On success the new thread gets default precedence and, when
 * net_affinity is enabled, a randomized affinity tag.  Returns 0 on
 * success; panics if kernel_thread_start() fails.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2451 
#if TEST_INPUT_THREAD_TERMINATION
/*
 * Debug-only sysctl handler for if_input_thread_termination_spin.
 * Rejects updates with ENXIO when rx polling is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access or error: nothing to update */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2475 
/*
 * Return a dlil_threading_info to its pristine state after its input
 * thread has terminated: destroy the per-thread mutex and lock group,
 * clear all state/statistics, and verify nothing is left outstanding.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* the packet queue must already be empty at this point */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2501 
/*
 * Final input-thread teardown, executed on the input thread itself:
 * drain any queued packets, signal DLIL_INPUT_TERMINATE_COMPLETE to the
 * waiting thread, drop the reference taken at kernel_thread_start()
 * time, and terminate.  Never returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* take all queued packets out under the lock; free them afterwards */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2549 
2550 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2551 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2552 {
2553 	thread_affinity_policy_data_t policy;
2554 
2555 	bzero(&policy, sizeof(policy));
2556 	policy.affinity_tag = tag;
2557 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2558 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2559 }
2560 
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Eventhandler callback run when the set of active net-filter subsystems
 * changes.  The flowswitch transport netagent is enabled only when no
 * filters other than the PF private proxy are active; when the setting
 * flips, the nexus netagents are updated, otherwise (agent now disabled)
 * NECP clients are refreshed directly.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		necp_update_all_clients();
	}
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
2579 
/*
 * One-time boot initialization of the DLIL (Data Link Interface Layer):
 *  - validates compile-time layout/flag invariants,
 *  - processes boot-args and device-tree overrides,
 *  - creates the dlil zones and global interface lists,
 *  - initializes dependent networking subsystems, and
 *  - starts the main input thread and the ifnet detacher thread,
 *    waiting for both to have been scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	/* Logging flag and category constants must agree across interfaces */
	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	/* Interface family/subfamily constants must also agree */
	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* Boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	/* no dlil threads should have been created before this point */
	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* debug builds use the larger dlil_ifnet_dbg structure */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	/* Global interface lists start out empty */
	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2894 
2895 static void
if_flt_monitor_busy(struct ifnet * ifp)2896 if_flt_monitor_busy(struct ifnet *ifp)
2897 {
2898 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2899 
2900 	++ifp->if_flt_busy;
2901 	VERIFY(ifp->if_flt_busy != 0);
2902 }
2903 
/*
 * Drop one busy reference on the interface's filter list; equivalent to
 * if_flt_monitor_leave(), provided as the counterpart to
 * if_flt_monitor_busy().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2909 
/*
 * Enter the filter-list monitor: sleep until no other thread holds the
 * list busy, then mark it busy on behalf of the caller.  Caller must
 * hold if_flt_lock; msleep() drops and reacquires it while waiting.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	/* list is now free; take the busy reference for this thread */
	if_flt_monitor_busy(ifp);
}
2922 
/*
 * Release one busy reference on the filter list; when the count drops
 * to zero, wake up all threads blocked in if_flt_monitor_enter().
 * Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);  /* must match a prior busy/enter */
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		/* reset waiter count; woken threads re-check if_flt_busy */
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2936 
/*
 * Attach the interface filter described by if_filter to ifp, returning
 * the new filter reference via filter_ref.  flags are DLIL_IFF_* bits
 * stored in filt_flags.  Returns 0 on success, or ENXIO when the
 * interface is not in the global list or no longer attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an io refcnt on success; released below before "done" */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* track third-party (non-OS) filters per interface */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the io refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3027 
/*
 * Detach a filter previously attached via dlil_attach_filter().
 *
 * When "detached" is zero, search every attached interface for the
 * filter, mark it skipped, remove it from the list under the filter
 * monitor, and adjust the counters.  When "detached" is non-zero we
 * are called from ifnet_detach_final() with the list already emptied,
 * so only the counters are adjusted before the filter is destroyed.
 *
 * Returns 0 on success, or EINVAL if the reference was not found on
 * any interface.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only reachable with filter != NULL on the EINVAL path */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3148 
3149 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3150 dlil_detach_filter(interface_filter_t filter)
3151 {
3152 	if (filter == NULL) {
3153 		return;
3154 	}
3155 	dlil_detach_filter_internal(filter, 0);
3156 }
3157 
3158 __private_extern__ boolean_t
dlil_has_ip_filter(void)3159 dlil_has_ip_filter(void)
3160 {
3161 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3162 
3163 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3164 
3165 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3166 	return has_filter;
3167 }
3168 
3169 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3170 dlil_has_if_filter(struct ifnet *ifp)
3171 {
3172 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3173 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3174 	return has_filter;
3175 }
3176 
/*
 * Flag the input thread as having pending work and wake it up if it is
 * not currently running.  Caller must hold the thread's dlth_lock.
 */
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		/* thread is blocked on dlth_flags; count and wake it */
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
3188 
/*
 * Entry point for the main DLIL input thread.  Performs one-time setup
 * (sanity checks, embryonic handshake with dlil_init) and then blocks
 * with dlil_main_input_thread_cont as the continuation; all subsequent
 * work happens there.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the wakeup so the self-wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3211 
/*
 * Main input thread:
 *
 *   a) handles all inbound packets for lo0
 *   b) handles all inbound packets for interfaces with no dedicated
 *	input thread (e.g. anything but Ethernet/PDP or those that support
 *	opportunistic polling.)
 *   c) protocol registrations
 *   d) packet injections
 *
 * Runs as a thread_block_parameter() continuation: each pass drains the
 * shared and lo0 receive queues, processes them with dlth_lock dropped,
 * and blocks again when no more work is flagged.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: clear the embryonic marker */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* process the dequeued chains without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock dlil_init()'s startup rendezvous */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if more work was flagged while unlocked */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3308 
/*
 * Input thread for interfaces with legacy input model.
 *
 * Entry point: names the thread after its interface, performs the
 * embryonic handshake with the creator, and blocks with
 * dlil_input_thread_cont as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* rxpoll-capable legacy interfaces use the rxpoll thread instead */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the wakeup so the self-wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3346 
/*
 * Continuation body for a per-interface legacy input thread: drains
 * the interface's receive queue, syncs input stats, and processes the
 * packet chain with dlth_lock dropped.  Honors DLIL_INPUT_TERMINATE by
 * branching to dlil_terminate_input_thread().  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: clear the embryonic marker */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* process the dequeued chain without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock the creator's startup rendezvous */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if more work was flagged while unlocked */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3450 
/*
 * Input thread for interfaces with opportunistic polling input model.
 *
 * Entry point: names the thread after its interface, performs the
 * embryonic handshake with the creator, and blocks with
 * dlil_rxpoll_input_thread_cont as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* only rxpoll-capable legacy interfaces use this thread */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the wakeup so the self-wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3485 
/*
 * Continuation routine for the input thread of an interface using the
 * opportunistic polling input model (IFEF_RXPOLL | IFXF_LEGACY).
 *
 * Each pass drains the pending packet queue, folds packet/byte/wakeup
 * counts into EWMA-based polling statistics, and — when the averages
 * cross the configured low/high watermarks — asks the driver to switch
 * between interrupt (POLL_OFF) and polling (POLL_ON) input mode via the
 * IFNET_CTL_SET_INPUT_MODEL downcall.  The thread re-blocks in this
 * same continuation when there is no work left, and terminates itself
 * when DLIL_INPUT_TERMINATE is set.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* First wakeup merely clears the embryonic state; no work yet. */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* Clamp the poll interval up to the supported minimum. */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* Not yet at the end of a sampling window; keep accumulating. */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* Enforce a hold time between consecutive mode changes. */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Below both low watermarks: fall back to interrupts;
			 * above the high watermarks: switch to polling.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be released below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* Ask the driver to switch its input model. */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* No pending work bits left besides RUNNING/TERMINATE: stop. */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* Park in this continuation until the next wakeup. */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3771 
3772 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3773 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3774 {
3775 	if (p != NULL) {
3776 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3777 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3778 			return EINVAL;
3779 		}
3780 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3781 		    p->packets_lowat >= p->packets_hiwat) {
3782 			return EINVAL;
3783 		}
3784 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3785 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3786 			return EINVAL;
3787 		}
3788 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3789 		    p->bytes_lowat >= p->bytes_hiwat) {
3790 			return EINVAL;
3791 		}
3792 		if (p->interval_time != 0 &&
3793 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3794 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3795 		}
3796 	}
3797 	return 0;
3798 }
3799 
/*
 * Recompute the interface's rxpoll watermarks, packet limit and poll
 * interval.  With no link rate and no caller-supplied parameters,
 * polling is effectively disabled (watermarks opened wide).  Otherwise
 * defaults are looked up from rxpoll_tbl based on the input link rate,
 * with caller values and sysctl overrides taking precedence.  Caller
 * must hold the input thread's dlth_lock (see dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* Select the highest table entry at or below the link rate. */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* NOTE: a non-zero if_rxpoll_max sysctl overrides the caller's limit */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		/* likewise, a tweaked if_rxpoll_interval_time sysctl wins */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* Convert nanosecond holdtimes into timespecs for the sampler. */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3869 
/*
 * Must be called on an attached ifnet (caller is expected to check.)
 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
 * Returns ENXIO when the interface does not support opportunistic
 * polling, or EINVAL when the supplied parameters are inconsistent.
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	/* Polling requires IFEF_RXPOLL and a dedicated input thread. */
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3911 
3912 /*
3913  * Must be called on an attached ifnet (caller is expected to check.)
3914  */
3915 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3916 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3917 {
3918 	struct dlil_threading_info *inp;
3919 
3920 	VERIFY(ifp != NULL && p != NULL);
3921 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3922 		return ENXIO;
3923 	}
3924 
3925 	bzero(p, sizeof(*p));
3926 
3927 	lck_mtx_lock(&inp->dlth_lock);
3928 	p->packets_limit = ifp->if_rxpoll_plim;
3929 	p->packets_lowat = ifp->if_rxpoll_plowat;
3930 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3931 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3932 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3933 	p->interval_time = ifp->if_rxpoll_ival;
3934 	lck_mtx_unlock(&inp->dlth_lock);
3935 
3936 	return 0;
3937 }
3938 
/*
 * Basic driver inject path: chain tail and statistics are computed by
 * ifnet_input_common() itself (non-extended, non-polling variant).
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3945 
/*
 * Extended driver inject path: caller supplies the chain tail and a
 * mandatory stat increment structure (packets_in must match the chain).
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3952 
/*
 * Polling-mode inject path: an empty chain (m_head == NULL) is legal
 * here and simply wakes the input thread; `ext' is implied by a
 * non-empty chain.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	           (m_head != NULL), TRUE);
}
3960 
/*
 * Common backend for ifnet_input{,_extended,_poll}().
 *
 * Validates the arguments, takes a datamov (IO) reference on the
 * interface so it cannot detach mid-delivery, walks the chain to count
 * packets/bytes when the caller did not supply them, and hands the
 * chain to the interface's DLIL input function.  On invalid input the
 * whole chain is freed and EINVAL is returned.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* An empty chain is only allowed for poll; ext requires stats. */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* No tail supplied: walk the chain, counting as we go. */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* Recount the chain to cross-check the driver's stats. */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * NOTE(review): the call below forwards `s', so the recomputed
	 * _s.packets_in/_s.bytes_in only reach the input function when
	 * the caller passed s == NULL (then `s' aliases `_s').  For a
	 * non-NULL `s' the caller's original values are forwarded
	 * unchanged — presumably intentional, but worth confirming.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
4075 
4076 #if SKYWALK
/*
 * Atomically install a custom DLIL input handler; succeeds only while
 * the default dlil_input_handler is still in place.  Returns EBUSY if
 * some other handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4084 
/*
 * Restore the default DLIL input handler, retrying the CAS (from
 * whatever handler is currently installed) until it succeeds.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4094 
/*
 * Atomically install a custom DLIL output handler; succeeds only while
 * the default dlil_output_handler is still in place.  Returns EBUSY if
 * some other handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4102 
/*
 * Restore the default DLIL output handler, retrying the CAS (from
 * whatever handler is currently installed) until it succeeds.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4112 #endif /* SKYWALK */
4113 
/* Default DLIL output handler: hand the packet to the interface's if_output. */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4119 
/*
 * Default DLIL input handler: dispatch the inbound chain to the
 * interface's input-thread strategy (async or sync), falling back to
 * the main input thread when the interface has no dedicated one.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* Threads marked for synchronous RX bypass the normal strategy. */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4140 
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True only for mbuf-backed queues (QP_MBUF) whose current length
 * exceeds the larger of the sysctl burst limit and the queue's own
 * configured limit.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)

#define MAX_KNOWN_MBUF_CLASS 8
4149 
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent of
 * its limit, dropping packets from the head (oldest first).  Dropped
 * mbufs are moved onto `freeq' so the caller can free them after the
 * input thread lock has been dropped; `stat_delta' is adjusted to
 * account for the drops.  Returns the number of packets dropped.
 * Caller must hold the corresponding dlth_lock ("_locked" suffix).
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
4246 
/*
 * Asynchronous input strategy: enqueue the inbound chain on the input
 * thread's receive queue (trimming it if overcommitted), update the
 * statistics, and wake the input thread to process the packets later.
 * Also establishes thread affinity between the driver/poller thread
 * (`tp') and the input thread the first time each is seen.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* Local copy of the stats; adjusted if the queue gets trimmed. */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* Trim a burst that pushed the queue over its limit. */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4390 
/*
 * Synchronous variant of the DLIL input path for dedicated (non-main)
 * input threads: enqueue the mbuf chain on the thread's receive queue,
 * then immediately drain the entire queue and process it in the
 * caller's context instead of waking the input thread.
 *
 * inp:           dedicated input thread info (never dlil_main_input_thread)
 * ifp:           receiving interface
 * m_head/m_tail: packet chain to deliver (linked via m_nextpkt)
 * s:             stat increments describing the chain (packets_in/bytes_in)
 * poll:          TRUE if the chain was obtained via RX polling
 * tp:            unused
 *
 * Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* local copy of the stats; adjusted below if the queue is trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	/* packets dropped by trimming; freed only after dlth_lock is dropped */
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* drop excess packets if the receive burst limit has been exceeded */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the caller-supplied (m_cnt, m_size) accurately
	 * describes the chain `m_head'; unaffected by trimming above.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything that is now on the queue (may include backlog) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4500 
4501 #if SKYWALK
4502 errno_t
ifnet_set_output_handler(struct ifnet * ifp,ifnet_output_func fn)4503 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4504 {
4505 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4506 	           ptrauth_nop_cast(void *, ifp->if_save_output),
4507 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4508 }
4509 
/*
 * Restore the interface's original (saved) output handler,
 * undoing ifnet_set_output_handler().
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	/*
	 * CAS loop: the expected value is re-read from if_output on each
	 * attempt, so this retries only if another updater races between
	 * the load and the exchange.
	 */
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4519 
4520 errno_t
ifnet_set_start_handler(struct ifnet * ifp,ifnet_start_func fn)4521 ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4522 {
4523 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4524 	           ptrauth_nop_cast(void *, ifp->if_save_start),
4525 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4526 }
4527 
/*
 * Restore the interface's original (saved) start handler,
 * undoing ifnet_set_start_handler().
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	/*
	 * CAS loop: the expected value is re-read from if_start on each
	 * attempt, so this retries only if another updater races between
	 * the load and the exchange.
	 */
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4537 #endif /* SKYWALK */
4538 
/*
 * Request service from the interface's starter thread.
 *
 * resetfc:      clear IFSF_FLOW_CONTROLLED before deciding whether to
 *               wake the starter (used when flow control is lifted).
 * ignore_delay: set IFSF_NO_DELAY so the starter bypasses the
 *               enqueue-coalescing ("delay start") logic this round.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		/* flow controlled and not being reset: swallow the request */
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter only if it is idle and either the request must
	 * be serviced now (flow-control reset), multi-enqueue coalescing
	 * is off, or the send queue/delay state says enough has built up.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4571 
/*
 * Record the next driver-requested transmit (pacemaker) time for the
 * interface; compared against nanouptime() by ifnet_start_thread_cont()
 * to schedule a timed wakeup.  0 disables the pacemaker.
 */
void
ifnet_start_set_pacemaker_time(struct ifnet *ifp, uint64_t tx_time)
{
	ifp->if_start_pacemaker_time = tx_time;
}
4577 
/*
 * Kick the interface's starter thread, honoring flow control and any
 * enqueue-coalescing delay in effect.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4583 
/*
 * Kick the interface's starter thread, honoring flow control but
 * bypassing the enqueue-coalescing ("delay start") logic.
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4589 
/*
 * Entry point for the per-interface starter thread: names the thread,
 * optionally binds the lo0 starter to the main input thread's affinity
 * tag, performs one self-wakeup to leave the embryonic state, then
 * blocks with ifnet_start_thread_cont as the continuation.  Never
 * returns; all subsequent work happens in the continuation.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			/* this thread takes the "driver thread" affinity slot */
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4655 
/*
 * Continuation for the starter thread: services pending start requests
 * in a loop by invoking the driver's if_start callback, then computes
 * the next wakeup (pacemaker, TBR cycle, or delay-start timeout) and
 * blocks again with itself as the continuation.  Terminates the thread
 * when IFSF_TERMINATING is set or the wait was interrupted.  Entered
 * without if_start_lock held.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/*
	 * First wakeup after creation: leave the embryonic state and
	 * release the interface's pending-thread count (done with the
	 * lock dropped), then go straight to the rearm path.
	 */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delay-start: with IFEF_ENQUEUE_MULTI/IFEF_DELAY_START
		 * and a short queue, defer the driver call and rearm a
		 * short timeout instead (unless IFSF_NO_DELAY is set).
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	/* all requests up to this point have been consumed */
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec pacemaker_ts;
		struct timespec *ts = NULL;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		if (ifp->if_start_pacemaker_time != 0) {
			struct timespec now_ts;
			uint64_t now;

			nanouptime(&now_ts);
			now = ((uint64_t)now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

			/* still in the future: arm a one-shot pacemaker wakeup */
			if (ifp->if_start_pacemaker_time != 0 &&
			    ifp->if_start_pacemaker_time > now) {
				pacemaker_ts.tv_sec = 0;
				pacemaker_ts.tv_nsec = ifp->if_start_pacemaker_time - now;

				ts = &pacemaker_ts;
				ifp->if_start_flags |= IFSF_NO_DELAY;
				DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, ifp,
				    uint64_t, pacemaker_ts.tv_nsec);
			} else {
				/* pacemaker deadline already passed; clear it */
				DTRACE_SKYWALK2(pacemaker__timer__miss, struct ifnet*, ifp,
				    uint64_t, now - ifp->if_start_pacemaker_time);
				ifp->if_start_pacemaker_time = 0;
				ifp->if_start_flags &= ~IFSF_NO_DELAY;
			}
		}

		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "no timed wakeup" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		ifp->if_start_pacemaker_time = 0;
		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4833 
4834 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4835 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4836 {
4837 	if (ts == NULL) {
4838 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4839 	} else {
4840 		*(&ifp->if_start_cycle) = *ts;
4841 	}
4842 
4843 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4844 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4845 		    if_name(ifp), ts->tv_nsec);
4846 	}
4847 }
4848 
4849 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4850 ifnet_poll_wakeup(struct ifnet *ifp)
4851 {
4852 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4853 
4854 	ifp->if_poll_req++;
4855 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4856 	    ifp->if_poll_thread != THREAD_NULL) {
4857 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4858 	}
4859 }
4860 
/*
 * Request service from the interface's RX poller thread.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4871 
/*
 * Entry point for the per-interface RX poller thread: names the
 * thread, performs one self-wakeup to leave the embryonic state, then
 * blocks with ifnet_poll_thread_cont as the continuation.  Never
 * returns; all polling work happens in the continuation.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4900 
/*
 * Continuation for the RX poller thread: services poll requests in a
 * loop by calling the driver's if_input_poll routine and feeding the
 * harvested chain into ifnet_input_common(), then rearms a wakeup
 * (periodic poll cycle or indefinite) and blocks again with itself as
 * the continuation.  Terminates the thread when IF_POLLF_TERMINATING
 * is set or the wait was interrupted.  Entered without if_poll_lock
 * held.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after creation: leave the embryonic state and
	 * release the interface's pending-thread count (done with the
	 * lock dropped), then go straight to the rearm path.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet limit: explicit plim, or a derived default */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll still drives the input path forward */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	/* all requests up to this point have been consumed */
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5067 
5068 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5069 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5070 {
5071 	if (ts == NULL) {
5072 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5073 	} else {
5074 		*(&ifp->if_poll_cycle) = *ts;
5075 	}
5076 
5077 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5078 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5079 		    if_name(ifp), ts->tv_nsec);
5080 	}
5081 }
5082 
5083 void
ifnet_purge(struct ifnet * ifp)5084 ifnet_purge(struct ifnet *ifp)
5085 {
5086 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5087 		if_qflush_snd(ifp, false);
5088 	}
5089 }
5090 
/*
 * Propagate a classq event to the send queue.  If a token-bucket
 * regulator is active, re-apply its current profile so its parameters
 * are recomputed, then let the scheduler handle the event.  Caller
 * must hold the ifclassq lock.
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	/* nothing to do until the queue has been fully set up */
	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		/* re-program the TBR with its current (raw) parameters */
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	ifclassq_update(ifq, ev);
}
5110 
5111 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5112 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5113 {
5114 	switch (ev) {
5115 	case CLASSQ_EV_LINK_BANDWIDTH:
5116 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5117 			ifp->if_poll_update++;
5118 		}
5119 		break;
5120 
5121 	default:
5122 		break;
5123 	}
5124 }
5125 
5126 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)5127 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
5128 {
5129 	struct ifclassq *ifq;
5130 	u_int32_t omodel;
5131 	errno_t err;
5132 
5133 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
5134 		return EINVAL;
5135 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5136 		return ENXIO;
5137 	}
5138 
5139 	ifq = ifp->if_snd;
5140 	IFCQ_LOCK(ifq);
5141 	omodel = ifp->if_output_sched_model;
5142 	ifp->if_output_sched_model = model;
5143 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
5144 		ifp->if_output_sched_model = omodel;
5145 	}
5146 	IFCQ_UNLOCK(ifq);
5147 
5148 	return err;
5149 }
5150 
5151 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5152 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5153 {
5154 	if (ifp == NULL) {
5155 		return EINVAL;
5156 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5157 		return ENXIO;
5158 	}
5159 
5160 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5161 
5162 	return 0;
5163 }
5164 
5165 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5166 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5167 {
5168 	if (ifp == NULL || maxqlen == NULL) {
5169 		return EINVAL;
5170 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5171 		return ENXIO;
5172 	}
5173 
5174 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5175 
5176 	return 0;
5177 }
5178 
5179 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5180 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5181 {
5182 	errno_t err;
5183 
5184 	if (ifp == NULL || pkts == NULL) {
5185 		err = EINVAL;
5186 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5187 		err = ENXIO;
5188 	} else {
5189 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5190 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
5191 	}
5192 
5193 	return err;
5194 }
5195 
5196 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5197 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5198     u_int32_t *pkts, u_int32_t *bytes)
5199 {
5200 	errno_t err;
5201 
5202 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5203 	    (pkts == NULL && bytes == NULL)) {
5204 		err = EINVAL;
5205 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5206 		err = ENXIO;
5207 	} else {
5208 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5209 		    pkts, bytes);
5210 	}
5211 
5212 	return err;
5213 }
5214 
5215 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5216 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5217 {
5218 	struct dlil_threading_info *inp;
5219 
5220 	if (ifp == NULL) {
5221 		return EINVAL;
5222 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5223 		return ENXIO;
5224 	}
5225 
5226 	if (maxqlen == 0) {
5227 		maxqlen = if_rcvq_maxlen;
5228 	} else if (maxqlen < IF_RCVQ_MINLEN) {
5229 		maxqlen = IF_RCVQ_MINLEN;
5230 	}
5231 
5232 	inp = ifp->if_inp;
5233 	lck_mtx_lock(&inp->dlth_lock);
5234 	qlimit(&inp->dlth_pkts) = maxqlen;
5235 	lck_mtx_unlock(&inp->dlth_lock);
5236 
5237 	return 0;
5238 }
5239 
5240 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5241 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5242 {
5243 	struct dlil_threading_info *inp;
5244 
5245 	if (ifp == NULL || maxqlen == NULL) {
5246 		return EINVAL;
5247 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5248 		return ENXIO;
5249 	}
5250 
5251 	inp = ifp->if_inp;
5252 	lck_mtx_lock(&inp->dlth_lock);
5253 	*maxqlen = qlimit(&inp->dlth_pkts);
5254 	lck_mtx_unlock(&inp->dlth_lock);
5255 	return 0;
5256 }
5257 
5258 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5259 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5260     uint16_t delay_timeout)
5261 {
5262 	if (delay_qlen > 0 && delay_timeout > 0) {
5263 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5264 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5265 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
5266 		/* convert timeout to nanoseconds */
5267 		ifp->if_start_delay_timeout *= 1000;
5268 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5269 		    ifp->if_xname, (uint32_t)delay_qlen,
5270 		    (uint32_t)delay_timeout);
5271 	} else {
5272 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5273 	}
5274 }
5275 
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer used when buf is insufficiently aligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		/* operate on an aligned copy if needed; copied back below */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set (ECN bits are left untouched) */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the IPv4 header checksum for the
		 * TOS change (in the spirit of RFC 1624), folding the
		 * carry once.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		/* operate on an aligned copy if needed; copied back below */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set in the flow word */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* IPv6 has no header checksum; just rewrite the flow word */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
5336 
/*
 * Enqueue a single packet (mbuf or native Skywalk packet) on `ifcq', or
 * on the interface's default send queue when `ifcq' is NULL.  Besides
 * the bare enqueue this routine:
 *  - stamps/refreshes the packet timestamp and the interface (and,
 *    when attached to a flowswitch, nexus advisory) foreground and
 *    realtime activity timestamps;
 *  - clears DSCP on multicast IP packets over Wi-Fi infrastructure
 *    (AP interop workaround, see comments below);
 *  - maintains the IFEF_ENQUEUE_MULTI delayed-start heuristics;
 *  - kicks the driver start thread when appropriate.
 * The caller always relinquishes ownership of `p'; `*pdrop' reports
 * whether the classq dropped the packet.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;	/* nexus foreground send ts */
	volatile uint64_t *rt_ts = NULL;	/* nexus realtime send ts */
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;	/* non-NULL: IP hdr needing DSCP clear */
	uint8_t ip_ver;			/* meaningful only when mcast_buf != NULL */
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* Ensure at least the Ethernet header is contiguous. */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* Not IP; skip the DSCP workaround. */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup() may relocate the data. */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/*
			 * Unlike the mbuf path there is no pullup here;
			 * undersized packets simply skip the workaround.
			 */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* Apply the Wi-Fi multicast DSCP workaround, if armed above. */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on the count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5647 
5648 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5649 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5650     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5651     boolean_t flush, boolean_t *pdrop)
5652 {
5653 	int error;
5654 
5655 	/* enqueue the packet (caller consumes object) */
5656 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5657 	    cnt, bytes, pdrop);
5658 
5659 	/*
5660 	 * Tell the driver to start dequeueing; do this even when the queue
5661 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5662 	 * be dequeueing from other unsuspended queues.
5663 	 */
5664 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5665 		ifnet_start(ifp);
5666 	}
5667 	return error;
5668 }
5669 
5670 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5671 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5672 {
5673 	struct ifnet *ifp = handle;
5674 	boolean_t pdrop;        /* dummy */
5675 	uint32_t i;
5676 
5677 	ASSERT(n_pkts >= 1);
5678 	for (i = 0; i < n_pkts - 1; i++) {
5679 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5680 		    FALSE, &pdrop);
5681 	}
5682 	/* flush with the last packet */
5683 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5684 	    TRUE, &pdrop);
5685 
5686 	return 0;
5687 }
5688 
5689 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5690 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5691     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5692 {
5693 	if (ifp->if_output_netem != NULL) {
5694 		bool drop;
5695 		errno_t error;
5696 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5697 		*pdrop = drop ? TRUE : FALSE;
5698 		return error;
5699 	} else {
5700 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5701 	}
5702 }
5703 
5704 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5705 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5706 {
5707 	uint32_t bytes = m_pktlen(m);
5708 	struct mbuf *tail = m;
5709 	uint32_t cnt = 1;
5710 	boolean_t pdrop;
5711 
5712 	while (tail->m_nextpkt) {
5713 		VERIFY(tail->m_flags & M_PKTHDR);
5714 		tail = tail->m_nextpkt;
5715 		cnt++;
5716 		bytes += m_pktlen(tail);
5717 	}
5718 
5719 	return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5720 }
5721 
5722 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5723 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5724     boolean_t *pdrop)
5725 {
5726 	classq_pkt_t pkt;
5727 
5728 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5729 	    m->m_nextpkt != NULL) {
5730 		if (m != NULL) {
5731 			m_freem_list(m);
5732 			*pdrop = TRUE;
5733 		}
5734 		return EINVAL;
5735 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5736 	    !IF_FULLY_ATTACHED(ifp)) {
5737 		/* flag tested without lock for performance */
5738 		m_freem(m);
5739 		*pdrop = TRUE;
5740 		return ENXIO;
5741 	} else if (!(ifp->if_flags & IFF_UP)) {
5742 		m_freem(m);
5743 		*pdrop = TRUE;
5744 		return ENETDOWN;
5745 	}
5746 
5747 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5748 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5749 }
5750 
5751 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5752 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5753     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5754     boolean_t *pdrop)
5755 {
5756 	classq_pkt_t head, tail;
5757 
5758 	ASSERT(m_head != NULL);
5759 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5760 	ASSERT(m_tail != NULL);
5761 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5762 	ASSERT(ifp != NULL);
5763 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5764 
5765 	if (!IF_FULLY_ATTACHED(ifp)) {
5766 		/* flag tested without lock for performance */
5767 		m_freem_list(m_head);
5768 		*pdrop = TRUE;
5769 		return ENXIO;
5770 	} else if (!(ifp->if_flags & IFF_UP)) {
5771 		m_freem_list(m_head);
5772 		*pdrop = TRUE;
5773 		return ENETDOWN;
5774 	}
5775 
5776 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5777 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5778 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5779 	           flush, pdrop);
5780 }
5781 
5782 #if SKYWALK
5783 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5784 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5785     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5786 {
5787 	classq_pkt_t pkt;
5788 
5789 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5790 
5791 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5792 		if (kpkt != NULL) {
5793 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5794 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5795 			*pdrop = TRUE;
5796 		}
5797 		return EINVAL;
5798 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5799 	    !IF_FULLY_ATTACHED(ifp))) {
5800 		/* flag tested without lock for performance */
5801 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5802 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5803 		*pdrop = TRUE;
5804 		return ENXIO;
5805 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5806 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5807 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5808 		*pdrop = TRUE;
5809 		return ENETDOWN;
5810 	}
5811 
5812 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5813 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5814 }
5815 
/*
 * Enqueue a single Skywalk packet on the interface's default classq.
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5822 
/*
 * Enqueue a single Skywalk packet on an explicitly supplied classq
 * rather than the interface's default send queue.
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5829 
5830 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5831 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5832     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5833     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5834 {
5835 	classq_pkt_t head, tail;
5836 
5837 	ASSERT(k_head != NULL);
5838 	ASSERT(k_tail != NULL);
5839 	ASSERT(ifp != NULL);
5840 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5841 
5842 	if (!IF_FULLY_ATTACHED(ifp)) {
5843 		/* flag tested without lock for performance */
5844 		pp_free_packet_chain(k_head, NULL);
5845 		*pdrop = TRUE;
5846 		return ENXIO;
5847 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5848 		pp_free_packet_chain(k_head, NULL);
5849 		*pdrop = TRUE;
5850 		return ENETDOWN;
5851 	}
5852 
5853 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5854 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5855 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5856 	           flush, pdrop);
5857 }
5858 
/*
 * Enqueue a Skywalk packet chain on the interface's default classq.
 */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5867 
/*
 * Enqueue a Skywalk packet chain on an explicitly supplied classq
 * rather than the interface's default send queue.
 */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5876 #endif /* SKYWALK */
5877 
5878 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5879 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5880 {
5881 	errno_t rc;
5882 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5883 
5884 	if (ifp == NULL || mp == NULL) {
5885 		return EINVAL;
5886 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5887 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5888 		return ENXIO;
5889 	}
5890 	if (!ifnet_is_attached(ifp, 1)) {
5891 		return ENXIO;
5892 	}
5893 
5894 #if SKYWALK
5895 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5896 #endif /* SKYWALK */
5897 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5898 	    &pkt, NULL, NULL, NULL, 0);
5899 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5900 	ifnet_decr_iorefcnt(ifp);
5901 	*mp = pkt.cp_mbuf;
5902 	return rc;
5903 }
5904 
5905 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5906 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5907     struct mbuf **mp)
5908 {
5909 	errno_t rc;
5910 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5911 
5912 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5913 		return EINVAL;
5914 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5915 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5916 		return ENXIO;
5917 	}
5918 	if (!ifnet_is_attached(ifp, 1)) {
5919 		return ENXIO;
5920 	}
5921 
5922 #if SKYWALK
5923 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5924 #endif /* SKYWALK */
5925 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5926 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5927 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5928 	ifnet_decr_iorefcnt(ifp);
5929 	*mp = pkt.cp_mbuf;
5930 	return rc;
5931 }
5932 
5933 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5934 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5935     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5936 {
5937 	errno_t rc;
5938 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5939 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5940 
5941 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5942 		return EINVAL;
5943 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5944 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5945 		return ENXIO;
5946 	}
5947 	if (!ifnet_is_attached(ifp, 1)) {
5948 		return ENXIO;
5949 	}
5950 
5951 #if SKYWALK
5952 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5953 #endif /* SKYWALK */
5954 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5955 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5956 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5957 	ifnet_decr_iorefcnt(ifp);
5958 	*head = pkt_head.cp_mbuf;
5959 	if (tail != NULL) {
5960 		*tail = pkt_tail.cp_mbuf;
5961 	}
5962 	return rc;
5963 }
5964 
5965 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5966 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5967     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5968 {
5969 	errno_t rc;
5970 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5971 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5972 
5973 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5974 		return EINVAL;
5975 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5976 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5977 		return ENXIO;
5978 	}
5979 	if (!ifnet_is_attached(ifp, 1)) {
5980 		return ENXIO;
5981 	}
5982 
5983 #if SKYWALK
5984 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5985 #endif /* SKYWALK */
5986 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5987 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5988 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5989 	ifnet_decr_iorefcnt(ifp);
5990 	*head = pkt_head.cp_mbuf;
5991 	if (tail != NULL) {
5992 		*tail = pkt_tail.cp_mbuf;
5993 	}
5994 	return rc;
5995 }
5996 
5997 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5998 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5999     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
6000     u_int32_t *len)
6001 {
6002 	errno_t rc;
6003 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
6004 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
6005 
6006 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
6007 	    !MBUF_VALID_SC(sc)) {
6008 		return EINVAL;
6009 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6010 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
6011 		return ENXIO;
6012 	}
6013 	if (!ifnet_is_attached(ifp, 1)) {
6014 		return ENXIO;
6015 	}
6016 
6017 #if SKYWALK
6018 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6019 #endif /* SKYWALK */
6020 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6021 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6022 	    cnt, len, 0);
6023 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6024 	ifnet_decr_iorefcnt(ifp);
6025 	*head = pkt_head.cp_mbuf;
6026 	if (tail != NULL) {
6027 		*tail = pkt_tail.cp_mbuf;
6028 	}
6029 	return rc;
6030 }
6031 
6032 #if XNU_TARGET_OS_OSX
/*
 * Adapter used when the interface supplies only the legacy framer
 * callback (which takes no pre/post arguments): report zero
 * prepend/append byte counts, then delegate framing to the legacy
 * callback.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
6047 #endif /* XNU_TARGET_OS_OSX */
6048 
6049 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6050 packet_has_vlan_tag(struct mbuf * m)
6051 {
6052 	u_int   tag = 0;
6053 
6054 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6055 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6056 		if (tag == 0) {
6057 			/* the packet is just priority-tagged, clear the bit */
6058 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6059 		}
6060 	}
6061 	return tag != 0;
6062 }
6063 
/*
 * Run an inbound packet through the interface's filter chain.  A
 * filter callback may modify or consume `*m_p' / `*frame_header_p';
 * a non-zero return means a filter claimed the packet and the caller
 * must stop processing it.  Returns 0 when the packet survived all
 * filters.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/*
	 * NOTE(review): packet_has_vlan_tag() clears CSUM_VLAN_TAG_VALID
	 * on priority-tagged packets as a side effect, and here it runs
	 * even when the filter list is empty, whereas the output path
	 * checks for an empty list first -- confirm the asymmetry is
	 * intentional.
	 */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the mutex while calling out; the busy
			 * marker set above protects the list meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6124 
/*
 * Run an outbound packet through the interface's filter chain.  A
 * filter callback may modify or consume `*m_p'; a non-zero return
 * means a filter claimed the packet and the caller must stop
 * processing it.  Returns 0 when the packet survived all filters.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;	/* used only for the VLAN check below */

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* may clear CSUM_VLAN_TAG_VALID on priority-tagged packets */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the mutex while calling out; the busy
			 * marker set above protects the list meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6177 
6178 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)6179 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6180 {
6181 	int error;
6182 
6183 	if (ifproto->proto_kpi == kProtoKPI_v1) {
6184 		/* Version 1 protocols get one packet at a time */
6185 		while (m != NULL) {
6186 			char *  frame_header;
6187 			mbuf_t  next_packet;
6188 
6189 			next_packet = m->m_nextpkt;
6190 			m->m_nextpkt = NULL;
6191 			frame_header = m->m_pkthdr.pkt_hdr;
6192 			m->m_pkthdr.pkt_hdr = NULL;
6193 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6194 			    ifproto->protocol_family, m, frame_header);
6195 			if (error != 0 && error != EJUSTRETURN) {
6196 				m_freem(m);
6197 			}
6198 			m = next_packet;
6199 		}
6200 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
6201 		/* Version 2 protocols support packet lists */
6202 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6203 		    ifproto->protocol_family, m);
6204 		if (error != 0 && error != EJUSTRETURN) {
6205 			m_freem_list(m);
6206 		}
6207 	}
6208 }
6209 
6210 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6211 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6212     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6213 {
6214 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6215 
6216 	if (s->packets_in != 0) {
6217 		d->packets_in += s->packets_in;
6218 	}
6219 	if (s->bytes_in != 0) {
6220 		d->bytes_in += s->bytes_in;
6221 	}
6222 	if (s->errors_in != 0) {
6223 		d->errors_in += s->errors_in;
6224 	}
6225 
6226 	if (s->packets_out != 0) {
6227 		d->packets_out += s->packets_out;
6228 	}
6229 	if (s->bytes_out != 0) {
6230 		d->bytes_out += s->bytes_out;
6231 	}
6232 	if (s->errors_out != 0) {
6233 		d->errors_out += s->errors_out;
6234 	}
6235 
6236 	if (s->collisions != 0) {
6237 		d->collisions += s->collisions;
6238 	}
6239 	if (s->dropped != 0) {
6240 		d->dropped += s->dropped;
6241 	}
6242 
6243 	if (poll) {
6244 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6245 	}
6246 }
6247 
6248 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)6249 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
6250 {
6251 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
6252 
6253 	/*
6254 	 * Use of atomic operations is unavoidable here because
6255 	 * these stats may also be incremented elsewhere via KPIs.
6256 	 */
6257 	if (s->packets_in != 0) {
6258 		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
6259 		s->packets_in = 0;
6260 	}
6261 	if (s->bytes_in != 0) {
6262 		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
6263 		s->bytes_in = 0;
6264 	}
6265 	if (s->errors_in != 0) {
6266 		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
6267 		s->errors_in = 0;
6268 	}
6269 
6270 	if (s->packets_out != 0) {
6271 		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
6272 		s->packets_out = 0;
6273 	}
6274 	if (s->bytes_out != 0) {
6275 		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
6276 		s->bytes_out = 0;
6277 	}
6278 	if (s->errors_out != 0) {
6279 		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
6280 		s->errors_out = 0;
6281 	}
6282 
6283 	if (s->collisions != 0) {
6284 		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
6285 		s->collisions = 0;
6286 	}
6287 	if (s->dropped != 0) {
6288 		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
6289 		s->dropped = 0;
6290 	}
6291 
6292 	/*
6293 	 * No need for atomic operations as they are modified here
6294 	 * only from within the DLIL input thread context.
6295 	 */
6296 	if (ifp->if_poll_tstats.packets != 0) {
6297 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
6298 		ifp->if_poll_tstats.packets = 0;
6299 	}
6300 	if (ifp->if_poll_tstats.bytes != 0) {
6301 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
6302 		ifp->if_poll_tstats.bytes = 0;
6303 	}
6304 
6305 	return ifp->if_data_threshold != 0;
6306 }
6307 
6308 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6309 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6310 {
6311 	return dlil_input_packet_list_common(ifp, m, 0,
6312 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6313 }
6314 
6315 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6316 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6317     u_int32_t cnt, ifnet_model_t mode)
6318 {
6319 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6320 }
6321 
/*
 * Core RX demultiplexing loop.
 *
 * Walks the m_nextpkt-linked packet chain: for each packet it calls
 * the interface's if_demux routine to determine the protocol family,
 * optionally performs CLAT46/64 address family translation, runs the
 * attached interface filters, and then batches consecutive packets
 * destined for the same protocol so they can be delivered with a
 * single dlil_ifproto_input() call.
 *
 * ifp_param: receiving interface, or NULL to take the interface from
 *            each packet's m_pkthdr.rcvif (a chain may then span
 *            multiple interfaces).
 * m:         chain of received packets linked through m_nextpkt.
 * cnt:       packet count hint, used only on the extended path to
 *            derive the opportunistic-poll threshold.
 * mode:      input polling model (IFNET_MODEL_INPUT_POLL_ON/OFF).
 * ext:       TRUE when invoked via dlil_input_packet_list_extended().
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;	/* proto of the batch in flight */
	mbuf_t pkt_first = NULL;		/* head of the current batch */
	mbuf_t *pkt_next = NULL;		/* tail link of the current batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only on the extended, poll-on path */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* kick the legacy poller every poll_ival packets */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach the packet and its cached link-layer header */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					/* interface is detaching; drop */
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN means the demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			/* NOTE(review): m may have been reallocated by dlil_clat64 */
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* a frame header outside the mbuf span is bogus */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6654 
6655 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6656 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6657 {
6658 	errno_t err;
6659 
6660 	if (sync) {
6661 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6662 		if (err == EAFNOSUPPORT) {
6663 			err = 0;
6664 		}
6665 	} else {
6666 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6667 		err = 0;
6668 	}
6669 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6670 	    "(err=%d)\n", if_name(ifp),
6671 	    (err == 0 ? "successfully restored" : "failed to restore"),
6672 	    ifp->if_updatemcasts, err);
6673 
6674 	/* just return success */
6675 	return 0;
6676 }
6677 
6678 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6679 if_mcasts_update_async(struct ifnet *ifp)
6680 {
6681 	return if_mcasts_update_common(ifp, false);
6682 }
6683 
6684 errno_t
if_mcasts_update(struct ifnet * ifp)6685 if_mcasts_update(struct ifnet *ifp)
6686 {
6687 	return if_mcasts_update_common(ifp, true);
6688 }
6689 
6690 /* If ifp is set, we will increment the generation for the interface */
6691 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6692 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6693 {
6694 	if (ifp != NULL) {
6695 		ifnet_increment_generation(ifp);
6696 	}
6697 
6698 #if NECP
6699 	necp_update_all_clients();
6700 #endif /* NECP */
6701 
6702 	return kev_post_msg(event);
6703 }
6704 
6705 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6706 dlil_post_sifflags_msg(struct ifnet * ifp)
6707 {
6708 	struct kev_msg ev_msg;
6709 	struct net_event_data ev_data;
6710 
6711 	bzero(&ev_data, sizeof(ev_data));
6712 	bzero(&ev_msg, sizeof(ev_msg));
6713 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6714 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6715 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6716 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6717 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6718 	ev_data.if_family = ifp->if_family;
6719 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6720 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6721 	ev_msg.dv[0].data_ptr = &ev_data;
6722 	ev_msg.dv[1].data_length = 0;
6723 	dlil_post_complete_msg(ifp, &ev_msg);
6724 }
6725 
/* stack-allocated proto array is used when the proto count is small */
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Distribute a kernel event to everything attached to the interface:
 * first the interface filters, then every attached protocol's event
 * handler, then the interface's own if_event callback; finally the
 * event is posted via dlil_post_complete_msg() (which also bumps the
 * interface generation when update_generation is true).
 *
 * Returns the result of posting the event message.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock: filt_event may block or re-enter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	/* first pass: just count the attached protocols */
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many for the stack array; allocate */
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* second pass: snapshot each proto with a reference held */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted proto, lock-free */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		/* drop the ref taken during the snapshot */
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6826 
6827 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6828 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6829 {
6830 	struct kev_msg kev_msg;
6831 	int result = 0;
6832 
6833 	if (ifp == NULL || event == NULL) {
6834 		return EINVAL;
6835 	}
6836 
6837 	bzero(&kev_msg, sizeof(kev_msg));
6838 	kev_msg.vendor_code = event->vendor_code;
6839 	kev_msg.kev_class = event->kev_class;
6840 	kev_msg.kev_subclass = event->kev_subclass;
6841 	kev_msg.event_code = event->event_code;
6842 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6843 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6844 	kev_msg.dv[1].data_length = 0;
6845 
6846 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6847 
6848 	return result;
6849 }
6850 
6851 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6852 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6853 {
6854 	mbuf_t  n = m;
6855 	int chainlen = 0;
6856 
6857 	while (n != NULL) {
6858 		chainlen++;
6859 		n = n->m_next;
6860 	}
6861 	switch (chainlen) {
6862 	case 0:
6863 		break;
6864 	case 1:
6865 		os_atomic_inc(&cls->cls_one, relaxed);
6866 		break;
6867 	case 2:
6868 		os_atomic_inc(&cls->cls_two, relaxed);
6869 		break;
6870 	case 3:
6871 		os_atomic_inc(&cls->cls_three, relaxed);
6872 		break;
6873 	case 4:
6874 		os_atomic_inc(&cls->cls_four, relaxed);
6875 		break;
6876 	case 5:
6877 	default:
6878 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6879 		break;
6880 	}
6881 }
6882 
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6
 * packet; other protocol families are ignored.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	switch (proto_family) {
	case PF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
		break;
	}
	case PF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
		break;
	}
	default:
		break;
	}
}
#endif /* CONFIG_DTRACE */
6901 
6902 /*
6903  * dlil_output
6904  *
6905  * Caller should have a lock on the protocol domain if the protocol
6906  * doesn't support finer grained locking. In most cases, the lock
6907  * will be held from the socket layer and won't be released until
6908  * we return back to the socket layer.
6909  *
6910  * This does mean that we must take a protocol lock before we take
6911  * an interface lock if we're going to take both. This makes sense
6912  * because a protocol is likely to interact with an ifp while it
6913  * is under the protocol lock.
6914  *
6915  * An advisory code will be returned if adv is not null. This
6916  * can be used to provide feedback about interface queues to the
6917  * application.
6918  */
6919 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6920 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6921     void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6922 {
6923 	char *frame_type = NULL;
6924 	char *dst_linkaddr = NULL;
6925 	int retval = 0;
6926 	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6927 	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6928 	struct if_proto *proto = NULL;
6929 	mbuf_t  m = NULL;
6930 	mbuf_t  send_head = NULL;
6931 	mbuf_t  *send_tail = &send_head;
6932 	int iorefcnt = 0;
6933 	u_int32_t pre = 0, post = 0;
6934 	u_int32_t fpkts = 0, fbytes = 0;
6935 	int32_t flen = 0;
6936 	struct timespec now;
6937 	u_int64_t now_nsec;
6938 	boolean_t did_clat46 = FALSE;
6939 	protocol_family_t old_proto_family = proto_family;
6940 	struct sockaddr_in6 dest6;
6941 	struct rtentry *rt = NULL;
6942 	u_int32_t m_loop_set = 0;
6943 
6944 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6945 
6946 	/*
6947 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6948 	 * from happening while this operation is in progress
6949 	 */
6950 	if (!ifnet_datamov_begin(ifp)) {
6951 		retval = ENXIO;
6952 		goto cleanup;
6953 	}
6954 	iorefcnt = 1;
6955 
6956 	VERIFY(ifp->if_output_dlil != NULL);
6957 
6958 	/* update the driver's multicast filter, if needed */
6959 	if (ifp->if_updatemcasts > 0) {
6960 		if_mcasts_update_async(ifp);
6961 		ifp->if_updatemcasts = 0;
6962 	}
6963 
6964 	frame_type = frame_type_buffer;
6965 	dst_linkaddr = dst_linkaddr_buffer;
6966 
6967 	if (raw == 0) {
6968 		ifnet_lock_shared(ifp);
6969 		/* callee holds a proto refcnt upon success */
6970 		proto = find_attached_proto(ifp, proto_family);
6971 		if (proto == NULL) {
6972 			ifnet_lock_done(ifp);
6973 			retval = ENXIO;
6974 			goto cleanup;
6975 		}
6976 		ifnet_lock_done(ifp);
6977 	}
6978 
6979 preout_again:
6980 	if (packetlist == NULL) {
6981 		goto cleanup;
6982 	}
6983 
6984 	m = packetlist;
6985 	packetlist = packetlist->m_nextpkt;
6986 	m->m_nextpkt = NULL;
6987 
6988 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6989 
6990 	/*
6991 	 * Perform address family translation for the first
6992 	 * packet outside the loop in order to perform address
6993 	 * lookup for the translated proto family.
6994 	 */
6995 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6996 	    (ifp->if_type == IFT_CELLULAR ||
6997 	    dlil_is_clat_needed(proto_family, m))) {
6998 		retval = dlil_clat46(ifp, &proto_family, &m);
6999 		/*
7000 		 * Go to the next packet if translation fails
7001 		 */
7002 		if (retval != 0) {
7003 			m_freem(m);
7004 			m = NULL;
7005 			ip6stat.ip6s_clat464_out_drop++;
7006 			/* Make sure that the proto family is PF_INET */
7007 			ASSERT(proto_family == PF_INET);
7008 			goto preout_again;
7009 		}
7010 		/*
7011 		 * Free the old one and make it point to the IPv6 proto structure.
7012 		 *
7013 		 * Change proto for the first time we have successfully
7014 		 * performed address family translation.
7015 		 */
7016 		if (!did_clat46 && proto_family == PF_INET6) {
7017 			did_clat46 = TRUE;
7018 
7019 			if (proto != NULL) {
7020 				if_proto_free(proto);
7021 			}
7022 			ifnet_lock_shared(ifp);
7023 			/* callee holds a proto refcnt upon success */
7024 			proto = find_attached_proto(ifp, proto_family);
7025 			if (proto == NULL) {
7026 				ifnet_lock_done(ifp);
7027 				retval = ENXIO;
7028 				m_freem(m);
7029 				m = NULL;
7030 				goto cleanup;
7031 			}
7032 			ifnet_lock_done(ifp);
7033 			if (ifp->if_type == IFT_ETHER) {
7034 				/* Update the dest to translated v6 address */
7035 				dest6.sin6_len = sizeof(struct sockaddr_in6);
7036 				dest6.sin6_family = AF_INET6;
7037 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
7038 				dest = (const struct sockaddr *)&dest6;
7039 
7040 				/*
7041 				 * Lookup route to the translated destination
7042 				 * Free this route ref during cleanup
7043 				 */
7044 				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
7045 				    0, 0, ifp->if_index);
7046 
7047 				route = rt;
7048 			}
7049 		}
7050 	}
7051 
7052 	/*
7053 	 * This path gets packet chain going to the same destination.
7054 	 * The pre output routine is used to either trigger resolution of
7055 	 * the next hop or retreive the next hop's link layer addressing.
7056 	 * For ex: ether_inet(6)_pre_output routine.
7057 	 *
7058 	 * If the routine returns EJUSTRETURN, it implies that packet has
7059 	 * been queued, and therefore we have to call preout_again for the
7060 	 * following packet in the chain.
7061 	 *
7062 	 * For errors other than EJUSTRETURN, the current packet is freed
7063 	 * and the rest of the chain (pointed by packetlist is freed as
7064 	 * part of clean up.
7065 	 *
7066 	 * Else if there is no error the retrieved information is used for
7067 	 * all the packets in the chain.
7068 	 */
7069 	if (raw == 0) {
7070 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
7071 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
7072 		retval = 0;
7073 		if (preoutp != NULL) {
7074 			retval = preoutp(ifp, proto_family, &m, dest, route,
7075 			    frame_type, dst_linkaddr);
7076 
7077 			if (retval != 0) {
7078 				if (retval == EJUSTRETURN) {
7079 					goto preout_again;
7080 				}
7081 				m_freem(m);
7082 				m = NULL;
7083 				goto cleanup;
7084 			}
7085 		}
7086 	}
7087 
7088 	do {
7089 		/*
7090 		 * pkt_hdr is set here to point to m_data prior to
7091 		 * calling into the framer. This value of pkt_hdr is
7092 		 * used by the netif gso logic to retrieve the ip header
7093 		 * for the TCP packets, offloaded for TSO processing.
7094 		 */
7095 		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
7096 			uint8_t vlan_encap_len = 0;
7097 
7098 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
7099 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
7100 			}
7101 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
7102 		} else {
7103 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
7104 		}
7105 
7106 		/*
7107 		 * Perform address family translation if needed.
7108 		 * For now we only support stateless 4 to 6 translation
7109 		 * on the out path.
7110 		 *
7111 		 * The routine below translates IP header, updates protocol
7112 		 * checksum and also translates ICMP.
7113 		 *
7114 		 * We skip the first packet as it is already translated and
7115 		 * the proto family is set to PF_INET6.
7116 		 */
7117 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
7118 		    (ifp->if_type == IFT_CELLULAR ||
7119 		    dlil_is_clat_needed(proto_family, m))) {
7120 			retval = dlil_clat46(ifp, &proto_family, &m);
7121 			/* Goto the next packet if the translation fails */
7122 			if (retval != 0) {
7123 				m_freem(m);
7124 				m = NULL;
7125 				ip6stat.ip6s_clat464_out_drop++;
7126 				goto next;
7127 			}
7128 		}
7129 
7130 #if CONFIG_DTRACE
7131 		if (!raw) {
7132 			dlil_output_dtrace(ifp, proto_family, m);
7133 		}
7134 #endif /* CONFIG_DTRACE */
7135 
7136 		if (raw == 0 && ifp->if_framer != NULL) {
7137 			int rcvif_set = 0;
7138 
7139 			/*
7140 			 * If this is a broadcast packet that needs to be
7141 			 * looped back into the system, set the inbound ifp
7142 			 * to that of the outbound ifp.  This will allow
7143 			 * us to determine that it is a legitimate packet
7144 			 * for the system.  Only set the ifp if it's not
7145 			 * already set, just to be safe.
7146 			 */
7147 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
7148 			    m->m_pkthdr.rcvif == NULL) {
7149 				m->m_pkthdr.rcvif = ifp;
7150 				rcvif_set = 1;
7151 			}
7152 			m_loop_set = m->m_flags & M_LOOP;
7153 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
7154 			    frame_type, &pre, &post);
7155 			if (retval != 0) {
7156 				if (retval != EJUSTRETURN) {
7157 					m_freem(m);
7158 				}
7159 				goto next;
7160 			}
7161 
7162 			/*
7163 			 * For partial checksum offload, adjust the start
7164 			 * and stuff offsets based on the prepended header.
7165 			 */
7166 			if ((m->m_pkthdr.csum_flags &
7167 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7168 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7169 				m->m_pkthdr.csum_tx_stuff += pre;
7170 				m->m_pkthdr.csum_tx_start += pre;
7171 			}
7172 
7173 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
7174 				dlil_output_cksum_dbg(ifp, m, pre,
7175 				    proto_family);
7176 			}
7177 
7178 			/*
7179 			 * Clear the ifp if it was set above, and to be
7180 			 * safe, only if it is still the same as the
7181 			 * outbound ifp we have in context.  If it was
7182 			 * looped back, then a copy of it was sent to the
7183 			 * loopback interface with the rcvif set, and we
7184 			 * are clearing the one that will go down to the
7185 			 * layer below.
7186 			 */
7187 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
7188 				m->m_pkthdr.rcvif = NULL;
7189 			}
7190 		}
7191 
7192 		/*
7193 		 * Let interface filters (if any) do their thing ...
7194 		 */
7195 		retval = dlil_interface_filters_output(ifp, &m, proto_family);
7196 		if (retval != 0) {
7197 			if (retval != EJUSTRETURN) {
7198 				m_freem(m);
7199 			}
7200 			goto next;
7201 		}
7202 		/*
7203 		 * Strip away M_PROTO1 bit prior to sending packet
7204 		 * to the driver as this field may be used by the driver
7205 		 */
7206 		m->m_flags &= ~M_PROTO1;
7207 
7208 		/*
7209 		 * If the underlying interface is not capable of handling a
7210 		 * packet whose data portion spans across physically disjoint
7211 		 * pages, we need to "normalize" the packet so that we pass
7212 		 * down a chain of mbufs where each mbuf points to a span that
7213 		 * resides in the system page boundary.  If the packet does
7214 		 * not cross page(s), the following is a no-op.
7215 		 */
7216 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
7217 			if ((m = m_normalize(m)) == NULL) {
7218 				goto next;
7219 			}
7220 		}
7221 
7222 		/*
7223 		 * If this is a TSO packet, make sure the interface still
7224 		 * advertise TSO capability.
7225 		 */
7226 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
7227 			retval = EMSGSIZE;
7228 			m_freem(m);
7229 			goto cleanup;
7230 		}
7231 
7232 		ifp_inc_traffic_class_out(ifp, m);
7233 
7234 #if SKYWALK
7235 		/*
7236 		 * For native skywalk devices, packets will be passed to pktap
7237 		 * after GSO or after the mbuf to packet conversion.
7238 		 * This is done for IPv4/IPv6 packets only because there is no
7239 		 * space in the mbuf to pass down the proto family.
7240 		 */
7241 		if (dlil_is_native_netif_nexus(ifp)) {
7242 			if (raw || m->m_pkthdr.pkt_proto == 0) {
7243 				pktap_output(ifp, proto_family, m, pre, post);
7244 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
7245 			}
7246 		} else {
7247 			pktap_output(ifp, proto_family, m, pre, post);
7248 		}
7249 #else /* SKYWALK */
7250 		pktap_output(ifp, proto_family, m, pre, post);
7251 #endif /* SKYWALK */
7252 
7253 		/*
7254 		 * Count the number of elements in the mbuf chain
7255 		 */
7256 		if (tx_chain_len_count) {
7257 			dlil_count_chain_len(m, &tx_chain_len_stats);
7258 		}
7259 
7260 		/*
7261 		 * Record timestamp; ifnet_enqueue() will use this info
7262 		 * rather than redoing the work.  An optimization could
7263 		 * involve doing this just once at the top, if there are
7264 		 * no interface filters attached, but that's probably
7265 		 * not a big deal.
7266 		 */
7267 		nanouptime(&now);
7268 		net_timernsec(&now, &now_nsec);
7269 		(void) mbuf_set_timestamp(m, now_nsec, TRUE);
7270 
7271 		/*
7272 		 * Discard partial sum information if this packet originated
7273 		 * from another interface; the packet would already have the
7274 		 * final checksum and we shouldn't recompute it.
7275 		 */
7276 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
7277 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7278 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7279 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
7280 			m->m_pkthdr.csum_data = 0;
7281 		}
7282 
7283 		/*
7284 		 * Finally, call the driver.
7285 		 */
7286 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
7287 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7288 				flen += (m_pktlen(m) - (pre + post));
7289 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7290 			}
7291 			*send_tail = m;
7292 			send_tail = &m->m_nextpkt;
7293 		} else {
7294 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7295 				flen = (m_pktlen(m) - (pre + post));
7296 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7297 			} else {
7298 				flen = 0;
7299 			}
7300 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7301 			    0, 0, 0, 0, 0);
7302 			retval = (*ifp->if_output_dlil)(ifp, m);
7303 			if (retval == EQFULL || retval == EQSUSPENDED) {
7304 				if (adv != NULL && adv->code == FADV_SUCCESS) {
7305 					adv->code = (retval == EQFULL ?
7306 					    FADV_FLOW_CONTROLLED :
7307 					    FADV_SUSPENDED);
7308 				}
7309 				retval = 0;
7310 			}
7311 			if (retval == 0 && flen > 0) {
7312 				fbytes += flen;
7313 				fpkts++;
7314 			}
7315 			if (retval != 0 && dlil_verbose) {
7316 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7317 				    __func__, if_name(ifp),
7318 				    retval);
7319 			}
7320 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7321 			    0, 0, 0, 0, 0);
7322 		}
7323 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7324 
7325 next:
7326 		m = packetlist;
7327 		if (m != NULL) {
7328 			m->m_flags |= m_loop_set;
7329 			packetlist = packetlist->m_nextpkt;
7330 			m->m_nextpkt = NULL;
7331 		}
7332 		/* Reset the proto family to old proto family for CLAT */
7333 		if (did_clat46) {
7334 			proto_family = old_proto_family;
7335 		}
7336 	} while (m != NULL);
7337 
7338 	if (send_head != NULL) {
7339 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7340 		    0, 0, 0, 0, 0);
7341 		if (ifp->if_eflags & IFEF_SENDLIST) {
7342 			retval = (*ifp->if_output_dlil)(ifp, send_head);
7343 			if (retval == EQFULL || retval == EQSUSPENDED) {
7344 				if (adv != NULL) {
7345 					adv->code = (retval == EQFULL ?
7346 					    FADV_FLOW_CONTROLLED :
7347 					    FADV_SUSPENDED);
7348 				}
7349 				retval = 0;
7350 			}
7351 			if (retval == 0 && flen > 0) {
7352 				fbytes += flen;
7353 				fpkts++;
7354 			}
7355 			if (retval != 0 && dlil_verbose) {
7356 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7357 				    __func__, if_name(ifp), retval);
7358 			}
7359 		} else {
7360 			struct mbuf *send_m;
7361 			int enq_cnt = 0;
7362 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7363 			while (send_head != NULL) {
7364 				send_m = send_head;
7365 				send_head = send_m->m_nextpkt;
7366 				send_m->m_nextpkt = NULL;
7367 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7368 				if (retval == EQFULL || retval == EQSUSPENDED) {
7369 					if (adv != NULL) {
7370 						adv->code = (retval == EQFULL ?
7371 						    FADV_FLOW_CONTROLLED :
7372 						    FADV_SUSPENDED);
7373 					}
7374 					retval = 0;
7375 				}
7376 				if (retval == 0) {
7377 					enq_cnt++;
7378 					if (flen > 0) {
7379 						fpkts++;
7380 					}
7381 				}
7382 				if (retval != 0 && dlil_verbose) {
7383 					DLIL_PRINTF("%s: output error on %s "
7384 					    "retval = %d\n",
7385 					    __func__, if_name(ifp), retval);
7386 				}
7387 			}
7388 			if (enq_cnt > 0) {
7389 				fbytes += flen;
7390 				ifnet_start(ifp);
7391 			}
7392 		}
7393 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7394 	}
7395 
7396 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7397 
7398 cleanup:
7399 	if (fbytes > 0) {
7400 		ifp->if_fbytes += fbytes;
7401 	}
7402 	if (fpkts > 0) {
7403 		ifp->if_fpackets += fpkts;
7404 	}
7405 	if (proto != NULL) {
7406 		if_proto_free(proto);
7407 	}
7408 	if (packetlist) { /* if any packets are left, clean up */
7409 		mbuf_freem_list(packetlist);
7410 	}
7411 	if (retval == EJUSTRETURN) {
7412 		retval = 0;
7413 	}
7414 	if (iorefcnt == 1) {
7415 		ifnet_datamov_end(ifp);
7416 	}
7417 	if (rt != NULL) {
7418 		rtfree(rt);
7419 		rt = NULL;
7420 	}
7421 
7422 	return retval;
7423 }
7424 
7425 /*
7426  * This routine checks if the destination address is not a loopback, link-local,
7427  * multicast or broadcast address.
7428  */
7429 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7430 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7431 {
7432 	int ret = 0;
7433 	switch (proto_family) {
7434 	case PF_INET: {
7435 		struct ip *iph = mtod(m, struct ip *);
7436 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7437 			ret = 1;
7438 		}
7439 		break;
7440 	}
7441 	case PF_INET6: {
7442 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7443 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7444 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7445 			ret = 1;
7446 		}
7447 		break;
7448 	}
7449 	}
7450 
7451 	return ret;
7452 }
7453 /*
7454  * @brief This routine translates IPv4 packet to IPv6 packet,
7455  *     updates protocol checksum and also translates ICMP for code
7456  *     along with inner header translation.
7457  *
7458  * @param ifp Pointer to the interface
7459  * @param proto_family pointer to protocol family. It is updated if function
7460  *     performs the translation successfully.
7461  * @param m Pointer to the pointer pointing to the packet. Needed because this
7462  *     routine can end up changing the mbuf to a different one.
7463  *
7464  * @return 0 on success or else a negative value.
7465  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;              /* original IPv4 src/dst */
	uint8_t proto = 0;                      /* IPv4 payload protocol */
	struct in6_ifaddr *ia6_clat_src = NULL; /* local CLAT46 IPv6 address */
	struct in6_addr *src = NULL;            /* translated IPv6 source */
	struct in6_addr dst;                    /* synthesized IPv6 destination */
	int error = 0;
	uint16_t off = 0;                       /* IPv4 header length, bytes */
	uint16_t tot_len = 0;                   /* IPv4 total length, host order */
	uint16_t ip_id_val = 0;                 /* IPv4 identification field */
	uint16_t ip_frag_off = 0;               /* fragment offset (8-byte units) */

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 helpers can rewrite it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Save the header fields needed after the IPv4 header is replaced */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/*
	 * Hand the (possibly reallocated) mbuf back to the caller.  If the
	 * pbuf was invalidated by a failed translation, the packet is gone
	 * and *m is cleared so the caller does not touch freed memory.
	 */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		/* Success: tell the caller the packet is now IPv6 */
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7599 
7600 /*
7601  * @brief This routine translates incoming IPv6 to IPv4 packet,
7602  *     updates protocol checksum and also translates ICMPv6 outer
7603  *     and inner headers
7604  *
7605  * @return 0 on success or else a negative value.
7606  */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;             /* original IPv6 src/dst */
	uint8_t proto = 0;                      /* IPv6 next header */
	struct in6_ifaddr *ia6_clat_dst = NULL; /* local CLAT46 IPv6 address */
	struct in_ifaddr *ia4_clat_dst = NULL;  /* local CLAT46 IPv4 address */
	struct in_addr *dst = NULL;             /* translated IPv4 destination */
	struct in_addr src;                     /* synthesized IPv4 source */
	int error = 0;
	uint32_t off = 0;                       /* offset past the IPv6 header */
	u_int64_t tot_len = 0;                  /* total packet length */
	uint8_t tos = 0;                        /* traffic class, becomes IPv4 TOS */
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf so the nat464 helpers can rewrite it */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Extract the traffic class bits to carry over as the IPv4 TOS */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

/* NOTE: this label is scoped inside the CLAT branch; pbuf is always valid here */
cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly reallocated) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			/* Success: tell the caller the packet is now IPv4 */
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7741 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Deferred ioctl request: the target interface and the ioctl to issue */
struct ifnet_ioctl_event {
	struct ifnet *ifp;
	u_long ioctl_code;
};

/* Work-queue item embedding the generic nwk_wq_entry linkage */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7754 
7755 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7756 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7757 {
7758 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7759 	bool compare_expected;
7760 
7761 	/*
7762 	 * Get an io ref count if the interface is attached.
7763 	 * At this point it most likely is. We are taking a reference for
7764 	 * deferred processing.
7765 	 */
7766 	if (!ifnet_is_attached(ifp, 1)) {
7767 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7768 		    "is not attached",
7769 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7770 		return;
7771 	}
7772 	switch (ioctl_code) {
7773 	case SIOCADDMULTI:
7774 		compare_expected = false;
7775 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7776 			ifnet_decr_iorefcnt(ifp);
7777 			return;
7778 		}
7779 		break;
7780 	case SIOCDELMULTI:
7781 		compare_expected = false;
7782 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7783 			ifnet_decr_iorefcnt(ifp);
7784 			return;
7785 		}
7786 		break;
7787 	default:
7788 		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7789 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7790 		return;
7791 	}
7792 
7793 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7794 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7795 
7796 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7797 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7798 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7799 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7800 }
7801 
7802 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7803 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7804 {
7805 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7806 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7807 
7808 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7809 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7810 	int ret = 0;
7811 
7812 	switch (ioctl_code) {
7813 	case SIOCADDMULTI:
7814 		atomic_store(&ifp->if_mcast_add_signaled, false);
7815 		break;
7816 	case SIOCDELMULTI:
7817 		atomic_store(&ifp->if_mcast_del_signaled, false);
7818 		break;
7819 	}
7820 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7821 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7822 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7823 	} else if (dlil_verbose) {
7824 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7825 		    "for ioctl %lu",
7826 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7827 	}
7828 	ifnet_decr_iorefcnt(ifp);
7829 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7830 	return;
7831 }
7832 
/*
 * Dispatch an ioctl for an interface.  The request is offered, in order,
 * to the attached interface filters, then to the protocol attached for
 * proto_fam (if non-zero), and finally to the interface's own if_ioctl
 * handler.  The first handler returning something other than EOPNOTSUPP
 * claims the ioctl; EJUSTRETURN stops the chain and is reported as 0.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* EOPNOTSUPP == "no one has handled it yet" */
	int result = 0;			/* result from the most recent handler */

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock while calling out to the filter; the
			 * busy marker set above keeps the list stable so the
			 * TAILQ iteration remains safe across the unlock.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					/* normalize ENOTSUP to EOPNOTSUPP */
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			/* v1 and v2 KPIs store the handler in different unions */
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means handled, nothing further to do: report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7950 
7951 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7952 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7953 {
7954 	errno_t error = 0;
7955 
7956 	if (ifp->if_set_bpf_tap) {
7957 		/* Get an io reference on the interface if it is attached */
7958 		if (!ifnet_is_attached(ifp, 1)) {
7959 			return ENXIO;
7960 		}
7961 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7962 		ifnet_decr_iorefcnt(ifp);
7963 	}
7964 	return error;
7965 }
7966 
7967 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7968 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7969     struct sockaddr *ll_addr, size_t ll_len)
7970 {
7971 	errno_t result = EOPNOTSUPP;
7972 	struct if_proto *proto;
7973 	const struct sockaddr *verify;
7974 	proto_media_resolve_multi resolvep;
7975 
7976 	if (!ifnet_is_attached(ifp, 1)) {
7977 		return result;
7978 	}
7979 
7980 	bzero(ll_addr, ll_len);
7981 
7982 	/* Call the protocol first; callee holds a proto refcnt upon success */
7983 	ifnet_lock_shared(ifp);
7984 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7985 	ifnet_lock_done(ifp);
7986 	if (proto != NULL) {
7987 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7988 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7989 		if (resolvep != NULL) {
7990 			result = resolvep(ifp, proto_addr,
7991 			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7992 		}
7993 		if_proto_free(proto);
7994 	}
7995 
7996 	/* Let the interface verify the multicast address */
7997 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7998 		if (result == 0) {
7999 			verify = ll_addr;
8000 		} else {
8001 			verify = proto_addr;
8002 		}
8003 		result = ifp->if_check_multi(ifp, verify);
8004 	}
8005 
8006 	ifnet_decr_iorefcnt(ifp);
8007 	return result;
8008 }
8009 
8010 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8011 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
8012     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8013     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8014 {
8015 	struct if_proto *proto;
8016 	errno_t result = 0;
8017 
8018 	if ((ifp->if_flags & IFF_NOARP) != 0) {
8019 		result = ENOTSUP;
8020 		goto done;
8021 	}
8022 
8023 	/* callee holds a proto refcnt upon success */
8024 	ifnet_lock_shared(ifp);
8025 	proto = find_attached_proto(ifp, target_proto->sa_family);
8026 	ifnet_lock_done(ifp);
8027 	if (proto == NULL) {
8028 		result = ENOTSUP;
8029 	} else {
8030 		proto_media_send_arp    arpp;
8031 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8032 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8033 		if (arpp == NULL) {
8034 			result = ENOTSUP;
8035 		} else {
8036 			switch (arpop) {
8037 			case ARPOP_REQUEST:
8038 				arpstat.txrequests++;
8039 				if (target_hw != NULL) {
8040 					arpstat.txurequests++;
8041 				}
8042 				break;
8043 			case ARPOP_REPLY:
8044 				arpstat.txreplies++;
8045 				break;
8046 			}
8047 			result = arpp(ifp, arpop, sender_hw, sender_proto,
8048 			    target_hw, target_proto);
8049 		}
8050 		if_proto_free(proto);
8051 	}
8052 done:
8053 	return result;
8054 }
8055 
/*
 * Network thread marks: an empty tag type whose address arithmetic is used
 * to encode mark bits as opaque tokens (see net_thread_marks_push below).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* The "no marks were changed" token: offset zero from the base object */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8061 
/*
 * Set the given mark bits on the current thread and return an opaque
 * token recording which bits this call actually set (i.e. were not set
 * before).  The token is a pointer offset from net_thread_marks_base;
 * net_thread_marks_pop() decodes it to clear exactly those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits not already set on this thread are new */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* Encode the newly-set bits as a byte offset from base */
	return (net_thread_marks_t)&base[pop];
}
8079 
/*
 * Clear the given mark bits on the current thread and return an opaque
 * token recording which bits this call actually cleared (i.e. were set
 * before).  net_thread_unmarks_pop() decodes the token to restore them.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits currently set on this thread are cleared */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* Encode the newly-cleared bits as a byte offset from base */
	return (net_thread_marks_t)&base[unpop];
}
8097 
/*
 * Undo a net_thread_marks_push(): decode the token back into the bits
 * that call set, verify they are still set, and clear them.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	/* The token's distance from base is the bit mask to clear */
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The mask must fit in 32 bits and all its bits must be set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8113 
/*
 * Undo a net_thread_unmarks_push(): decode the token back into the bits
 * that call cleared, verify they are still clear, and set them again.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	/* The token's distance from base is the bit mask to restore */
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The mask must fit in 32 bits and all its bits must be clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
8129 
8130 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8131 net_thread_is_marked(u_int32_t check)
8132 {
8133 	if (check != 0) {
8134 		struct uthread *uth = current_uthread();
8135 		return uth->uu_network_marks & check;
8136 	} else {
8137 		return 0;
8138 	}
8139 }
8140 
8141 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8142 net_thread_is_unmarked(u_int32_t check)
8143 {
8144 	if (check != 0) {
8145 		struct uthread *uth = current_uthread();
8146 		return ~uth->uu_network_marks & check;
8147 	} else {
8148 		return 0;
8149 	}
8150 }
8151 
8152 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)8153 _is_announcement(const struct sockaddr_in * sender_sin,
8154     const struct sockaddr_in * target_sin)
8155 {
8156 	if (target_sin == NULL || sender_sin == NULL) {
8157 		return FALSE;
8158 	}
8159 
8160 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
8161 }
8162 
/*
 * Send an ARP packet of type 'arpop' on behalf of a protocol.
 *
 * Normally the packet is handed to dlil_send_arp_internal() on the
 * given interface.  Exception: an ARP request whose target is an IPv4
 * link-local address (with ipv4_ll_arp_aware enabled, and which is not
 * an announcement) is replicated on every attached interface marked
 * IFEF_ARPLL, each time using that interface's own link-layer and IPv4
 * source addresses.
 *
 * Returns 0 on success, EINVAL for missing/mismatched protocol
 * addresses, ENOTSUP if no eligible interface was found in the
 * fan-out case, or the error from the underlying send.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	/* cast away const; target_proto may be redirected to a local copy below */
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		/* stays ENOTSUP unless at least one interface is tried below */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr ifaddr alive past the unlock */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* report the outcome of the first attempt only */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8277 
8278 /*
8279  * Caller must hold ifnet head lock.
8280  */
8281 static int
ifnet_lookup(struct ifnet * ifp)8282 ifnet_lookup(struct ifnet *ifp)
8283 {
8284 	struct ifnet *_ifp;
8285 
8286 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8287 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8288 		if (_ifp == ifp) {
8289 			break;
8290 		}
8291 	}
8292 	return _ifp != NULL;
8293 }
8294 
8295 /*
8296  * Caller has to pass a non-zero refio argument to get a
8297  * IO reference count. This will prevent ifnet_detach from
8298  * being called when there are outstanding io reference counts.
8299  */
8300 int
ifnet_is_attached(struct ifnet * ifp,int refio)8301 ifnet_is_attached(struct ifnet *ifp, int refio)
8302 {
8303 	int ret;
8304 
8305 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8306 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
8307 		if (refio > 0) {
8308 			ifp->if_refio++;
8309 		}
8310 	}
8311 	lck_mtx_unlock(&ifp->if_ref_lock);
8312 
8313 	return ret;
8314 }
8315 
/*
 * Count a kernel thread that has been requested for this interface
 * but has not started running yet; the matching decrement is done in
 * ifnet_decr_pending_thread_count().
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8323 
/*
 * Drop one pending-thread count taken by
 * ifnet_incr_pending_thread_count(); when the count reaches zero,
 * wake anyone sleeping on &ifp->if_threads_pending.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8335 
8336 /*
8337  * Caller must ensure the interface is attached; the assumption is that
8338  * there is at least an outstanding IO reference count held already.
8339  * Most callers would call ifnet_is_{attached,data_ready}() instead.
8340  */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* caller's existing IO reference guarantees these hold */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8350 
/*
 * Drop one IO reference with if_ref_lock already held by the caller.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* every data-movement reference must be backed by an IO reference */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8371 
/*
 * Drop one IO reference; see ifnet_decr_iorefcnt_locked() for the
 * detach wakeup performed when the count hits zero.
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8379 
8380 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8381 ifnet_datamov_begin(struct ifnet *ifp)
8382 {
8383 	boolean_t ret;
8384 
8385 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8386 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8387 		ifp->if_refio++;
8388 		ifp->if_datamov++;
8389 	}
8390 	lck_mtx_unlock(&ifp->if_ref_lock);
8391 
8392 	return ret;
8393 }
8394 
/*
 * Leave the data path entered via ifnet_datamov_begin(): drop the
 * data-mover count and the IO reference; the last mover wakes any
 * drainer blocked in ifnet_datamov_drain().
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8412 
/*
 * Suspend data movement with if_ref_lock held: take an IO reference
 * (released by ifnet_datamov_resume()) and, on the first suspension,
 * clear IFRF_READY so ifnet_datamov_begin() — which requires the
 * interface to be ready — stops admitting new data movers.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8423 
/*
 * Unconditionally suspend data movement (nestable; paired with
 * ifnet_datamov_resume()).
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8432 
8433 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8434 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8435 {
8436 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8437 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8438 	if (ifp->if_suspend > 0) {
8439 		lck_mtx_unlock(&ifp->if_ref_lock);
8440 		return FALSE;
8441 	}
8442 	ifnet_datamov_suspend_locked(ifp);
8443 	lck_mtx_unlock(&ifp->if_ref_lock);
8444 	return TRUE;
8445 }
8446 
/*
 * Block until every in-flight data mover has left the data path.
 * Data movement must already be suspended (if_suspend > 0 with
 * IFRF_READY cleared).  Multiple drainers may wait concurrently; each
 * sleeps on &ifp->if_datamov and is woken by the last mover in
 * ifnet_datamov_end().  Once quiesced, the transmit queues are
 * flushed for drivers using the TXSTART model.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		/* msleep drops and reacquires if_ref_lock */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8474 
/*
 * Convenience wrapper: suspend data movement, then wait for all
 * in-flight movers to drain.
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8481 
/*
 * Undo one suspension taken via ifnet_datamov_suspend*(); when the
 * last suspension is lifted, restore IFRF_READY so data movers can be
 * admitted again.  Also releases the IO reference taken at suspend
 * time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8495 
8496 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8497 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8498 {
8499 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8500 	ctrace_t *tr;
8501 	u_int32_t idx;
8502 	u_int16_t *cnt;
8503 
8504 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8505 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8506 		/* NOTREACHED */
8507 	}
8508 
8509 	if (refhold) {
8510 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8511 		tr = dl_if_dbg->dldbg_if_refhold;
8512 	} else {
8513 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8514 		tr = dl_if_dbg->dldbg_if_refrele;
8515 	}
8516 
8517 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8518 	ctrace_record(&tr[idx]);
8519 }
8520 
8521 errno_t
dlil_if_ref(struct ifnet * ifp)8522 dlil_if_ref(struct ifnet *ifp)
8523 {
8524 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8525 
8526 	if (dl_if == NULL) {
8527 		return EINVAL;
8528 	}
8529 
8530 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8531 	++dl_if->dl_if_refcnt;
8532 	if (dl_if->dl_if_refcnt == 0) {
8533 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8534 		/* NOTREACHED */
8535 	}
8536 	if (dl_if->dl_if_trace != NULL) {
8537 		(*dl_if->dl_if_trace)(dl_if, TRUE);
8538 	}
8539 	lck_mtx_unlock(&dl_if->dl_if_lock);
8540 
8541 	return 0;
8542 }
8543 
8544 errno_t
dlil_if_free(struct ifnet * ifp)8545 dlil_if_free(struct ifnet *ifp)
8546 {
8547 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8548 	bool need_release = FALSE;
8549 
8550 	if (dl_if == NULL) {
8551 		return EINVAL;
8552 	}
8553 
8554 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8555 	switch (dl_if->dl_if_refcnt) {
8556 	case 0:
8557 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8558 		/* NOTREACHED */
8559 		break;
8560 	case 1:
8561 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8562 			need_release = TRUE;
8563 		}
8564 		break;
8565 	default:
8566 		break;
8567 	}
8568 	--dl_if->dl_if_refcnt;
8569 	if (dl_if->dl_if_trace != NULL) {
8570 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8571 	}
8572 	lck_mtx_unlock(&dl_if->dl_if_lock);
8573 	if (need_release) {
8574 		_dlil_if_release(ifp, true);
8575 	}
8576 	return 0;
8577 }
8578 
/*
 * Attach an initialized if_proto to its interface:
 *  - rejects anything but PF_BRIDGE on vmnet interfaces;
 *  - fails with EEXIST if the family is already attached;
 *  - lets the family module refine the demux descriptors
 *    (ifp->if_add_proto);
 *  - appends the proto to the tail of its hash chain and takes a
 *    proto reference for the attachment.
 * On success, posts KEV_DL_PROTO_ATTACHED and optionally reports the
 * number of attached protocols via *proto_count.  An IO reference on
 * the ifnet is held across the call and dropped at ioref_done.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an IO reference to keep the detach path at bay */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		/* release the refcnt taken by find_attached_proto() */
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8658 
/*
 * Post-attach housekeeping once a protocol is on the interface: mark
 * it IFF_UP, push the flag change to the driver via SIOCSIFFLAGS, and
 * broadcast the SIFFLAGS kernel event.  Under SKYWALK, attaching an
 * IP protocol additionally attaches the flowswitch nexus.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8682 
/*
 * Public KPI: attach a v1 protocol handler to an interface.
 * Validates the arguments and the interface's presence on the global
 * list under the ifnet head lock, builds an if_proto from
 * proto_details, and delegates to dlil_attach_protocol().  On failure
 * the if_proto is freed here; on success the interface is brought up
 * via dlil_handle_proto_attach().
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held across the attach to pin the ifnet on the list */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* bring the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; ownership of ifproto stays with us */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8744 
/*
 * Public KPI: attach a v2 protocol handler to an interface.
 * Identical flow to ifnet_attach_protocol(), but populates the
 * kpi.v2 callback set from ifnet_attach_proto_param_v2.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held across the attach to pin the ifnet on the list */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* bring the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; ownership of ifproto stays with us */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8806 
/*
 * Public KPI: detach a protocol family from an interface.
 * Removes the if_proto from its hash chain, notifies the family
 * module through if_del_proto, and replaces the protocol's callbacks
 * with inert ifproto_media_* stubs (which simply return ENXIO / do
 * nothing) before marking it detached — presumably so racing callers
 * hit harmless stubs rather than stale pointers.  Two proto
 * references are released: the one taken by find_attached_proto()
 * here and the one held since attach; final teardown happens when the
 * last reference is dropped.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* neutralize the callbacks with harmless stubs */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8872 
8873 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8874 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8875     struct mbuf *packet, char *header)
8876 {
8877 #pragma unused(ifp, protocol, packet, header)
8878 	return ENXIO;
8879 }
8880 
8881 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8882 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8883     struct mbuf *packet)
8884 {
8885 #pragma unused(ifp, protocol, packet)
8886 	return ENXIO;
8887 }
8888 
8889 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8890 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8891     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8892     char *link_layer_dest)
8893 {
8894 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8895 	return ENXIO;
8896 }
8897 
8898 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8899 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8900     const struct kev_msg *event)
8901 {
8902 #pragma unused(ifp, protocol, event)
8903 }
8904 
8905 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8906 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8907     unsigned long command, void *argument)
8908 {
8909 #pragma unused(ifp, protocol, command, argument)
8910 	return ENXIO;
8911 }
8912 
8913 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8914 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8915     struct sockaddr_dl *out_ll, size_t ll_len)
8916 {
8917 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8918 	return ENXIO;
8919 }
8920 
8921 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8922 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8923     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8924     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8925 {
8926 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8927 	return ENXIO;
8928 }
8929 
8930 extern int if_next_index(void);
8931 extern int tcp_ecn_outbound;
8932 
8933 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8934 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8935 {
8936 	uint32_t sflags = 0;
8937 	int err;
8938 
8939 	if (if_flowadv) {
8940 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8941 	}
8942 
8943 	if (if_delaybased_queue) {
8944 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8945 	}
8946 
8947 	if (ifp->if_output_sched_model ==
8948 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8949 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8950 	}
8951 	/* Inherit drop limit from the default queue */
8952 	if (ifp->if_snd != ifcq) {
8953 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8954 	}
8955 	/* Initialize transmit queue(s) */
8956 	err = ifclassq_setup(ifcq, ifp, sflags);
8957 	if (err != 0) {
8958 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8959 		    "err=%d", __func__, ifp, err);
8960 		/* NOTREACHED */
8961 	}
8962 }
8963 
8964 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8965 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8966 {
8967 #if SKYWALK
8968 	boolean_t netif_compat;
8969 	if_nexus_netif  nexus_netif;
8970 #endif /* SKYWALK */
8971 	struct ifnet *tmp_if;
8972 	struct ifaddr *ifa;
8973 	struct if_data_internal if_data_saved;
8974 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8975 	struct dlil_threading_info *dl_inp;
8976 	thread_continue_t thfunc = NULL;
8977 	int err;
8978 
8979 	if (ifp == NULL) {
8980 		return EINVAL;
8981 	}
8982 
8983 	/*
8984 	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8985 	 * prevent the interface from being configured while it is
8986 	 * embryonic, as ifnet_head_lock is dropped and reacquired
8987 	 * below prior to marking the ifnet with IFRF_ATTACHED.
8988 	 */
8989 	dlil_if_lock();
8990 	ifnet_head_lock_exclusive();
8991 	/* Verify we aren't already on the list */
8992 	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8993 		if (tmp_if == ifp) {
8994 			ifnet_head_done();
8995 			dlil_if_unlock();
8996 			return EEXIST;
8997 		}
8998 	}
8999 
9000 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9001 	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
9002 		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
9003 		    __func__, ifp);
9004 		/* NOTREACHED */
9005 	}
9006 	lck_mtx_unlock(&ifp->if_ref_lock);
9007 
9008 	ifnet_lock_exclusive(ifp);
9009 
9010 	/* Sanity check */
9011 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9012 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9013 	VERIFY(ifp->if_threads_pending == 0);
9014 
9015 	if (ll_addr != NULL) {
9016 		if (ifp->if_addrlen == 0) {
9017 			ifp->if_addrlen = ll_addr->sdl_alen;
9018 		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
9019 			ifnet_lock_done(ifp);
9020 			ifnet_head_done();
9021 			dlil_if_unlock();
9022 			return EINVAL;
9023 		}
9024 	}
9025 
9026 	/*
9027 	 * Allow interfaces without protocol families to attach
9028 	 * only if they have the necessary fields filled out.
9029 	 */
9030 	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
9031 		DLIL_PRINTF("%s: Attempt to attach interface without "
9032 		    "family module - %d\n", __func__, ifp->if_family);
9033 		ifnet_lock_done(ifp);
9034 		ifnet_head_done();
9035 		dlil_if_unlock();
9036 		return ENODEV;
9037 	}
9038 
9039 	/* Allocate protocol hash table */
9040 	VERIFY(ifp->if_proto_hash == NULL);
9041 	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
9042 	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9043 
9044 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9045 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9046 	TAILQ_INIT(&ifp->if_flt_head);
9047 	VERIFY(ifp->if_flt_busy == 0);
9048 	VERIFY(ifp->if_flt_waiters == 0);
9049 	VERIFY(ifp->if_flt_non_os_count == 0);
9050 	VERIFY(ifp->if_flt_no_tso_count == 0);
9051 	lck_mtx_unlock(&ifp->if_flt_lock);
9052 
9053 	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
9054 		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
9055 		LIST_INIT(&ifp->if_multiaddrs);
9056 	}
9057 
9058 	VERIFY(ifp->if_allhostsinm == NULL);
9059 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9060 	TAILQ_INIT(&ifp->if_addrhead);
9061 
9062 	if (ifp->if_index == 0) {
9063 		int idx = if_next_index();
9064 
9065 		/*
9066 		 * Since we exhausted the list of
9067 		 * if_index's, try to find an empty slot
9068 		 * in ifindex2ifnet.
9069 		 */
9070 		if (idx == -1 && if_index >= UINT16_MAX) {
9071 			for (int i = 1; i < if_index; i++) {
9072 				if (ifindex2ifnet[i] == NULL &&
9073 				    ifnet_addrs[i - 1] == NULL) {
9074 					idx = i;
9075 					break;
9076 				}
9077 			}
9078 		}
9079 		if (idx == -1) {
9080 			ifp->if_index = 0;
9081 			ifnet_lock_done(ifp);
9082 			ifnet_head_done();
9083 			dlil_if_unlock();
9084 			return ENOBUFS;
9085 		}
9086 		ifp->if_index = (uint16_t)idx;
9087 
9088 		/* the lladdr passed at attach time is the permanent address */
9089 		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
9090 		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
9091 			bcopy(CONST_LLADDR(ll_addr),
9092 			    dl_if->dl_if_permanent_ether,
9093 			    ETHER_ADDR_LEN);
9094 			dl_if->dl_if_permanent_ether_is_set = 1;
9095 		}
9096 	}
9097 	/* There should not be anything occupying this slot */
9098 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9099 
9100 	/* allocate (if needed) and initialize a link address */
9101 	ifa = dlil_alloc_lladdr(ifp, ll_addr);
9102 	if (ifa == NULL) {
9103 		ifnet_lock_done(ifp);
9104 		ifnet_head_done();
9105 		dlil_if_unlock();
9106 		return ENOBUFS;
9107 	}
9108 
9109 	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
9110 	ifnet_addrs[ifp->if_index - 1] = ifa;
9111 
9112 	/* make this address the first on the list */
9113 	IFA_LOCK(ifa);
9114 	/* hold a reference for ifnet_addrs[] */
9115 	IFA_ADDREF_LOCKED(ifa);
9116 	/* if_attach_link_ifa() holds a reference for ifa_link */
9117 	if_attach_link_ifa(ifp, ifa);
9118 	IFA_UNLOCK(ifa);
9119 
9120 	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
9121 	ifindex2ifnet[ifp->if_index] = ifp;
9122 
9123 	/* Hold a reference to the underlying dlil_ifnet */
9124 	ifnet_reference(ifp);
9125 
9126 	/* Clear stats (save and restore other fields that we care) */
9127 	if_data_saved = ifp->if_data;
9128 	bzero(&ifp->if_data, sizeof(ifp->if_data));
9129 	ifp->if_data.ifi_type = if_data_saved.ifi_type;
9130 	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
9131 	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
9132 	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
9133 	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
9134 	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
9135 	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
9136 	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
9137 	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
9138 	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
9139 	ifnet_touch_lastchange(ifp);
9140 
9141 	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
9142 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
9143 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
9144 
9145 	dlil_ifclassq_setup(ifp, ifp->if_snd);
9146 
9147 	/* Sanity checks on the input thread storage */
9148 	dl_inp = &dl_if->dl_if_inpstorage;
9149 	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
9150 	VERIFY(dl_inp->dlth_flags == 0);
9151 	VERIFY(dl_inp->dlth_wtot == 0);
9152 	VERIFY(dl_inp->dlth_ifp == NULL);
9153 	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
9154 	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
9155 	VERIFY(!dl_inp->dlth_affinity);
9156 	VERIFY(ifp->if_inp == NULL);
9157 	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
9158 	VERIFY(dl_inp->dlth_strategy == NULL);
9159 	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
9160 	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
9161 	VERIFY(dl_inp->dlth_affinity_tag == 0);
9162 
9163 #if IFNET_INPUT_SANITY_CHK
9164 	VERIFY(dl_inp->dlth_pkts_cnt == 0);
9165 #endif /* IFNET_INPUT_SANITY_CHK */
9166 
9167 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9168 	dlil_reset_rxpoll_params(ifp);
9169 	/*
9170 	 * A specific DLIL input thread is created per non-loopback interface.
9171 	 */
9172 	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
9173 		ifp->if_inp = dl_inp;
9174 		ifnet_incr_pending_thread_count(ifp);
9175 		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
9176 		if (err == ENODEV) {
9177 			VERIFY(thfunc == NULL);
9178 			ifnet_decr_pending_thread_count(ifp);
9179 		} else if (err != 0) {
9180 			panic_plain("%s: ifp=%p couldn't get an input thread; "
9181 			    "err=%d", __func__, ifp, err);
9182 			/* NOTREACHED */
9183 		}
9184 	}
9185 	/*
9186 	 * If the driver supports the new transmit model, calculate flow hash
9187 	 * and create a workloop starter thread to invoke the if_start callback
9188 	 * where the packets may be dequeued and transmitted.
9189 	 */
9190 	if (ifp->if_eflags & IFEF_TXSTART) {
9191 		thread_precedence_policy_data_t info;
9192 		__unused kern_return_t kret;
9193 
9194 		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
9195 		VERIFY(ifp->if_flowhash != 0);
9196 		VERIFY(ifp->if_start_thread == THREAD_NULL);
9197 
9198 		ifnet_set_start_cycle(ifp, NULL);
9199 		ifp->if_start_pacemaker_time = 0;
9200 		ifp->if_start_active = 0;
9201 		ifp->if_start_req = 0;
9202 		ifp->if_start_flags = 0;
9203 		VERIFY(ifp->if_start != NULL);
9204 		ifnet_incr_pending_thread_count(ifp);
9205 		if ((err = kernel_thread_start(ifnet_start_thread_func,
9206 		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
9207 			panic_plain("%s: "
9208 			    "ifp=%p couldn't get a start thread; "
9209 			    "err=%d", __func__, ifp, err);
9210 			/* NOTREACHED */
9211 		}
9212 		bzero(&info, sizeof(info));
9213 		info.importance = 1;
9214 		kret = thread_policy_set(ifp->if_start_thread,
9215 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9216 		    THREAD_PRECEDENCE_POLICY_COUNT);
9217 		ASSERT(kret == KERN_SUCCESS);
9218 	} else {
9219 		ifp->if_flowhash = 0;
9220 	}
9221 
9222 	/* Reset polling parameters */
9223 	ifnet_set_poll_cycle(ifp, NULL);
9224 	ifp->if_poll_update = 0;
9225 	ifp->if_poll_flags = 0;
9226 	ifp->if_poll_req = 0;
9227 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9228 
9229 	/*
9230 	 * If the driver supports the new receive model, create a poller
9231 	 * thread to invoke if_input_poll callback where the packets may
9232 	 * be dequeued from the driver and processed for reception.
9233 	 * if the interface is netif compat then the poller thread is
9234 	 * managed by netif.
9235 	 */
9236 	if (thfunc == dlil_rxpoll_input_thread_func) {
9237 		thread_precedence_policy_data_t info;
9238 		__unused kern_return_t kret;
9239 #if SKYWALK
9240 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9241 #endif /* SKYWALK */
9242 		VERIFY(ifp->if_input_poll != NULL);
9243 		VERIFY(ifp->if_input_ctl != NULL);
9244 		ifnet_incr_pending_thread_count(ifp);
9245 		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
9246 		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
9247 			panic_plain("%s: ifp=%p couldn't get a poll thread; "
9248 			    "err=%d", __func__, ifp, err);
9249 			/* NOTREACHED */
9250 		}
9251 		bzero(&info, sizeof(info));
9252 		info.importance = 1;
9253 		kret = thread_policy_set(ifp->if_poll_thread,
9254 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9255 		    THREAD_PRECEDENCE_POLICY_COUNT);
9256 		ASSERT(kret == KERN_SUCCESS);
9257 	}
9258 
9259 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9260 	VERIFY(ifp->if_desc.ifd_len == 0);
9261 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9262 
9263 	/* Record attach PC stacktrace */
9264 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
9265 
9266 	ifp->if_updatemcasts = 0;
9267 	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
9268 		struct ifmultiaddr *ifma;
9269 		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
9270 			IFMA_LOCK(ifma);
9271 			if (ifma->ifma_addr->sa_family == AF_LINK ||
9272 			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
9273 				ifp->if_updatemcasts++;
9274 			}
9275 			IFMA_UNLOCK(ifma);
9276 		}
9277 
9278 		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
9279 		    "membership(s)\n", if_name(ifp),
9280 		    ifp->if_updatemcasts);
9281 	}
9282 
9283 	/* Clear logging parameters */
9284 	bzero(&ifp->if_log, sizeof(ifp->if_log));
9285 
9286 	/* Clear foreground/realtime activity timestamps */
9287 	ifp->if_fg_sendts = 0;
9288 	ifp->if_rt_sendts = 0;
9289 
9290 	/* Clear throughput estimates and radio type */
9291 	ifp->if_estimated_up_bucket = 0;
9292 	ifp->if_estimated_down_bucket = 0;
9293 	ifp->if_radio_type = 0;
9294 	ifp->if_radio_channel = 0;
9295 
9296 	VERIFY(ifp->if_delegated.ifp == NULL);
9297 	VERIFY(ifp->if_delegated.type == 0);
9298 	VERIFY(ifp->if_delegated.family == 0);
9299 	VERIFY(ifp->if_delegated.subfamily == 0);
9300 	VERIFY(ifp->if_delegated.expensive == 0);
9301 	VERIFY(ifp->if_delegated.constrained == 0);
9302 
9303 	VERIFY(ifp->if_agentids == NULL);
9304 	VERIFY(ifp->if_agentcount == 0);
9305 
9306 	/* Reset interface state */
9307 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9308 	ifp->if_interface_state.valid_bitmask |=
9309 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
9310 	ifp->if_interface_state.interface_availability =
9311 	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
9312 
9313 	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
9314 	if (ifp == lo_ifp) {
9315 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
9316 		ifp->if_interface_state.valid_bitmask |=
9317 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
9318 	} else {
9319 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
9320 	}
9321 
9322 	/*
9323 	 * Enable ECN capability on this interface depending on the
9324 	 * value of ECN global setting
9325 	 */
9326 	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
9327 		if_set_eflags(ifp, IFEF_ECN_ENABLE);
9328 		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
9329 	}
9330 
9331 	/*
9332 	 * Built-in Cyclops always on policy for WiFi infra
9333 	 */
9334 	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9335 		errno_t error;
9336 
9337 		error = if_set_qosmarking_mode(ifp,
9338 		    IFRTYPE_QOSMARKING_FASTLANE);
9339 		if (error != 0) {
9340 			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9341 			    __func__, ifp->if_xname, error);
9342 		} else {
9343 			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9344 #if (DEVELOPMENT || DEBUG)
9345 			DLIL_PRINTF("%s fastlane enabled on %s\n",
9346 			    __func__, ifp->if_xname);
9347 #endif /* (DEVELOPMENT || DEBUG) */
9348 		}
9349 	}
9350 
9351 	ifnet_lock_done(ifp);
9352 	ifnet_head_done();
9353 
9354 #if SKYWALK
9355 	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9356 #endif /* SKYWALK */
9357 
9358 	lck_mtx_lock(&ifp->if_cached_route_lock);
9359 	/* Enable forwarding cached route */
9360 	ifp->if_fwd_cacheok = 1;
9361 	/* Clean up any existing cached routes */
9362 	ROUTE_RELEASE(&ifp->if_fwd_route);
9363 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9364 	ROUTE_RELEASE(&ifp->if_src_route);
9365 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9366 	ROUTE_RELEASE(&ifp->if_src_route6);
9367 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9368 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9369 
9370 	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9371 
9372 	/*
9373 	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9374 	 * and trees; do this before the ifnet is marked as attached.
9375 	 * The ifnet keeps the reference to the info structures even after
9376 	 * the ifnet is detached, since the network-layer records still
9377 	 * refer to the info structures even after that.  This also
9378 	 * makes it possible for them to still function after the ifnet
9379 	 * is recycled or reattached.
9380 	 */
9381 #if INET
9382 	if (IGMP_IFINFO(ifp) == NULL) {
9383 		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9384 		VERIFY(IGMP_IFINFO(ifp) != NULL);
9385 	} else {
9386 		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9387 		igmp_domifreattach(IGMP_IFINFO(ifp));
9388 	}
9389 #endif /* INET */
9390 	if (MLD_IFINFO(ifp) == NULL) {
9391 		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9392 		VERIFY(MLD_IFINFO(ifp) != NULL);
9393 	} else {
9394 		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9395 		mld_domifreattach(MLD_IFINFO(ifp));
9396 	}
9397 
9398 	VERIFY(ifp->if_data_threshold == 0);
9399 	VERIFY(ifp->if_dt_tcall != NULL);
9400 
9401 	/*
9402 	 * Wait for the created kernel threads for I/O to get
9403 	 * scheduled and run at least once before we proceed
9404 	 * to mark interface as attached.
9405 	 */
9406 	lck_mtx_lock(&ifp->if_ref_lock);
9407 	while (ifp->if_threads_pending != 0) {
9408 		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9409 		    "interface %s to get scheduled at least once.\n",
9410 		    __func__, ifp->if_xname);
9411 		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9412 		    __func__, NULL);
9413 		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9414 	}
9415 	lck_mtx_unlock(&ifp->if_ref_lock);
9416 	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9417 	    "at least once. Proceeding.\n", __func__, ifp->if_xname);
9418 
9419 	/* Final mark this ifnet as attached. */
9420 	ifnet_lock_exclusive(ifp);
9421 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9422 	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9423 	lck_mtx_unlock(&ifp->if_ref_lock);
9424 	if (net_rtref) {
9425 		/* boot-args override; enable idle notification */
9426 		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9427 		    IFRF_IDLE_NOTIFY);
9428 	} else {
9429 		/* apply previous request(s) to set the idle flags, if any */
9430 		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9431 		    ifp->if_idle_new_flags_mask);
9432 	}
9433 #if SKYWALK
9434 	/* the interface is fully attached; let the nexus adapter know */
9435 	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9436 		if (netif_compat) {
9437 			if (sk_netif_compat_txmodel ==
9438 			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9439 				ifnet_enqueue_multi_setup(ifp,
9440 				    sk_tx_delay_qlen, sk_tx_delay_timeout);
9441 			}
9442 			ifp->if_nx_netif = nexus_netif;
9443 		}
9444 		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9445 	}
9446 #endif /* SKYWALK */
9447 	ifnet_lock_done(ifp);
9448 	dlil_if_unlock();
9449 
9450 #if PF
9451 	/*
9452 	 * Attach packet filter to this interface, if enabled.
9453 	 */
9454 	pf_ifnet_hook(ifp, 1);
9455 #endif /* PF */
9456 
9457 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9458 
9459 	if (dlil_verbose) {
9460 		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9461 		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9462 	}
9463 
9464 	return 0;
9465 }
9466 
/*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself.  Although the link
 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
 * its location in memory must never change as it may still be referred
 * to by some parts of the system afterwards (unfortunate implementation
 * artifacts inherited from BSD.)
 *
 * Caller must hold ifnet lock as writer.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * The interface name is embedded in sdl_data ahead of the
	 * link-layer address; compute the resulting sockaddr_dl sizes:
	 * masklen covers the header plus the name, socksize additionally
	 * covers the link-layer address bytes.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
	/* Round socksize up to a multiple of sizeof (u_int32_t) */
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* Populate the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* Netmask marks the embedded-name portion with all-ones bytes */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* Drop the reference held on any address this one replaces */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9585 
/*
 * Tell the IPv4 (when built with INET) and IPv6 layers to drop all of
 * their network addresses on this interface.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9594 
/*
 * Begin detaching an interface: mark it down and IFRF_DETACHING, unlink
 * it from ifnet_head/ifindex2ifnet[] so lookups no longer find it, reset
 * per-interface state, and hand the ifnet to the detacher worker thread
 * for final teardown (see ifnet_detaching_enqueue).
 *
 * Returns EINVAL if ifp is NULL or not attached, ENXIO if a detach is
 * already in progress, 0 on success.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Invalidate any CGA (cryptographically generated address) state */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* Tear down any attached network emulation (netem) instance */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9789 
/*
 * Append ifp to the list of interfaces pending final detach and wake the
 * detacher thread (which waits on ifnet_delayed_run) to process it.
 *
 * Caller must hold the dlil interface lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);       /* guard against wraparound */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	wakeup((caddr_t)&ifnet_delayed_run);
}
9800 
9801 static struct ifnet *
ifnet_detaching_dequeue(void)9802 ifnet_detaching_dequeue(void)
9803 {
9804 	struct ifnet *ifp;
9805 
9806 	dlil_if_lock_assert();
9807 
9808 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9809 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9810 	if (ifp != NULL) {
9811 		VERIFY(ifnet_detaching_cnt != 0);
9812 		--ifnet_detaching_cnt;
9813 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9814 		ifp->if_detaching_link.tqe_next = NULL;
9815 		ifp->if_detaching_link.tqe_prev = NULL;
9816 	}
9817 	return ifp;
9818 }
9819 
/*
 * Continuation body of the detacher thread: drain the pending-detach
 * list, calling ifnet_detach_final() on each interface (with the dlil
 * interface lock dropped around the call), then block again on
 * ifnet_delayed_run with this function as the continuation.  Never
 * returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	/* First wakeup after creation: leave embryonic state (see
	 * ifnet_detacher_thread_func) */
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; ifnet_detach_final() may block */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* Queue drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9862 
/*
 * Entry point of the detacher thread.  Arms the wait on ifnet_delayed_run,
 * marks itself embryonic, and issues a self-wakeup so that the first run
 * of ifnet_detacher_thread_cont() clears the embryonic state and drops
 * the pending-thread count.  All subsequent work happens in the
 * continuation; this function never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9879 
9880 static void
ifnet_detach_final(struct ifnet * ifp)9881 ifnet_detach_final(struct ifnet *ifp)
9882 {
9883 	struct ifnet_filter *filter, *filter_next;
9884 	struct dlil_ifnet *dlifp;
9885 	struct ifnet_filter_head fhead;
9886 	struct dlil_threading_info *inp;
9887 	struct ifaddr *ifa;
9888 	ifnet_detached_func if_free;
9889 	int i;
9890 
9891 	/* Let BPF know we're detaching */
9892 	bpfdetach(ifp);
9893 
9894 #if SKYWALK
9895 	dlil_netif_detach_notify(ifp);
9896 	/*
9897 	 * Wait for the datapath to quiesce before tearing down
9898 	 * netif/flowswitch nexuses.
9899 	 */
9900 	dlil_quiesce_and_detach_nexuses(ifp);
9901 #endif /* SKYWALK */
9902 
9903 	lck_mtx_lock(&ifp->if_ref_lock);
9904 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9905 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9906 		    __func__, ifp);
9907 		/* NOTREACHED */
9908 	}
9909 
9910 	/*
9911 	 * Wait until the existing IO references get released
9912 	 * before we proceed with ifnet_detach.  This is not a
9913 	 * common case, so block without using a continuation.
9914 	 */
9915 	while (ifp->if_refio > 0) {
9916 		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9917 		    "to be released\n", __func__, if_name(ifp));
9918 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9919 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9920 	}
9921 
9922 	VERIFY(ifp->if_datamov == 0);
9923 	VERIFY(ifp->if_drainers == 0);
9924 	VERIFY(ifp->if_suspend == 0);
9925 	ifp->if_refflags &= ~IFRF_READY;
9926 	lck_mtx_unlock(&ifp->if_ref_lock);
9927 
9928 	/* Clear agent IDs */
9929 	if (ifp->if_agentids != NULL) {
9930 		kfree_data(ifp->if_agentids,
9931 		    sizeof(uuid_t) * ifp->if_agentcount);
9932 		ifp->if_agentids = NULL;
9933 	}
9934 	ifp->if_agentcount = 0;
9935 
9936 #if SKYWALK
9937 	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9938 #endif /* SKYWALK */
9939 	/* Drain and destroy send queue */
9940 	ifclassq_teardown(ifp->if_snd);
9941 
9942 	/* Detach interface filters */
9943 	lck_mtx_lock(&ifp->if_flt_lock);
9944 	if_flt_monitor_enter(ifp);
9945 
9946 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9947 	fhead = ifp->if_flt_head;
9948 	TAILQ_INIT(&ifp->if_flt_head);
9949 
9950 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9951 		filter_next = TAILQ_NEXT(filter, filt_next);
9952 		lck_mtx_unlock(&ifp->if_flt_lock);
9953 
9954 		dlil_detach_filter_internal(filter, 1);
9955 		lck_mtx_lock(&ifp->if_flt_lock);
9956 	}
9957 	if_flt_monitor_leave(ifp);
9958 	lck_mtx_unlock(&ifp->if_flt_lock);
9959 
9960 	/* Tell upper layers to drop their network addresses */
9961 	if_purgeaddrs(ifp);
9962 
9963 	ifnet_lock_exclusive(ifp);
9964 
9965 	/* Unplumb all protocols */
9966 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9967 		struct if_proto *proto;
9968 
9969 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9970 		while (proto != NULL) {
9971 			protocol_family_t family = proto->protocol_family;
9972 			ifnet_lock_done(ifp);
9973 			proto_unplumb(family, ifp);
9974 			ifnet_lock_exclusive(ifp);
9975 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9976 		}
9977 		/* There should not be any protocols left */
9978 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9979 	}
9980 	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9981 	ifp->if_proto_hash = NULL;
9982 
9983 	/* Detach (permanent) link address from if_addrhead */
9984 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9985 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9986 	IFA_LOCK(ifa);
9987 	if_detach_link_ifa(ifp, ifa);
9988 	IFA_UNLOCK(ifa);
9989 
9990 	/* Remove (permanent) link address from ifnet_addrs[] */
9991 	IFA_REMREF(ifa);
9992 	ifnet_addrs[ifp->if_index - 1] = NULL;
9993 
9994 	/* This interface should not be on {ifnet_head,detaching} */
9995 	VERIFY(ifp->if_link.tqe_next == NULL);
9996 	VERIFY(ifp->if_link.tqe_prev == NULL);
9997 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9998 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9999 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
10000 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
10001 
10002 	/* The slot should have been emptied */
10003 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
10004 
10005 	/* There should not be any addresses left */
10006 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
10007 
10008 	/*
10009 	 * Signal the starter thread to terminate itself, and wait until
10010 	 * it has exited.
10011 	 */
10012 	if (ifp->if_start_thread != THREAD_NULL) {
10013 		lck_mtx_lock_spin(&ifp->if_start_lock);
10014 		ifp->if_start_flags |= IFSF_TERMINATING;
10015 		wakeup_one((caddr_t)&ifp->if_start_thread);
10016 		lck_mtx_unlock(&ifp->if_start_lock);
10017 
10018 		/* wait for starter thread to terminate */
10019 		lck_mtx_lock(&ifp->if_start_lock);
10020 		while (ifp->if_start_thread != THREAD_NULL) {
10021 			if (dlil_verbose) {
10022 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10023 				    __func__,
10024 				    if_name(ifp));
10025 			}
10026 			(void) msleep(&ifp->if_start_thread,
10027 			    &ifp->if_start_lock, (PZERO - 1),
10028 			    "ifnet_start_thread_exit", NULL);
10029 		}
10030 		lck_mtx_unlock(&ifp->if_start_lock);
10031 		if (dlil_verbose) {
10032 			DLIL_PRINTF("%s: %s starter thread termination complete",
10033 			    __func__, if_name(ifp));
10034 		}
10035 	}
10036 
10037 	/*
10038 	 * Signal the poller thread to terminate itself, and wait until
10039 	 * it has exited.
10040 	 */
10041 	if (ifp->if_poll_thread != THREAD_NULL) {
10042 #if SKYWALK
10043 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10044 #endif /* SKYWALK */
10045 		lck_mtx_lock_spin(&ifp->if_poll_lock);
10046 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10047 		wakeup_one((caddr_t)&ifp->if_poll_thread);
10048 		lck_mtx_unlock(&ifp->if_poll_lock);
10049 
10050 		/* wait for poller thread to terminate */
10051 		lck_mtx_lock(&ifp->if_poll_lock);
10052 		while (ifp->if_poll_thread != THREAD_NULL) {
10053 			if (dlil_verbose) {
10054 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10055 				    __func__,
10056 				    if_name(ifp));
10057 			}
10058 			(void) msleep(&ifp->if_poll_thread,
10059 			    &ifp->if_poll_lock, (PZERO - 1),
10060 			    "ifnet_poll_thread_exit", NULL);
10061 		}
10062 		lck_mtx_unlock(&ifp->if_poll_lock);
10063 		if (dlil_verbose) {
10064 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
10065 			    __func__, if_name(ifp));
10066 		}
10067 	}
10068 
10069 	/*
10070 	 * If thread affinity was set for the workloop thread, we will need
10071 	 * to tear down the affinity and release the extra reference count
10072 	 * taken at attach time.  Does not apply to lo0 or other interfaces
10073 	 * without dedicated input threads.
10074 	 */
10075 	if ((inp = ifp->if_inp) != NULL) {
10076 		VERIFY(inp != dlil_main_input_thread);
10077 
10078 		if (inp->dlth_affinity) {
10079 			struct thread *tp, *wtp, *ptp;
10080 
10081 			lck_mtx_lock_spin(&inp->dlth_lock);
10082 			wtp = inp->dlth_driver_thread;
10083 			inp->dlth_driver_thread = THREAD_NULL;
10084 			ptp = inp->dlth_poller_thread;
10085 			inp->dlth_poller_thread = THREAD_NULL;
10086 			ASSERT(inp->dlth_thread != THREAD_NULL);
10087 			tp = inp->dlth_thread;    /* don't nullify now */
10088 			inp->dlth_affinity_tag = 0;
10089 			inp->dlth_affinity = FALSE;
10090 			lck_mtx_unlock(&inp->dlth_lock);
10091 
10092 			/* Tear down poll thread affinity */
10093 			if (ptp != NULL) {
10094 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10095 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
10096 				(void) dlil_affinity_set(ptp,
10097 				    THREAD_AFFINITY_TAG_NULL);
10098 				thread_deallocate(ptp);
10099 			}
10100 
10101 			/* Tear down workloop thread affinity */
10102 			if (wtp != NULL) {
10103 				(void) dlil_affinity_set(wtp,
10104 				    THREAD_AFFINITY_TAG_NULL);
10105 				thread_deallocate(wtp);
10106 			}
10107 
10108 			/* Tear down DLIL input thread affinity */
10109 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10110 			thread_deallocate(tp);
10111 		}
10112 
10113 		/* disassociate ifp DLIL input thread */
10114 		ifp->if_inp = NULL;
10115 
10116 		/* if the worker thread was created, tell it to terminate */
10117 		if (inp->dlth_thread != THREAD_NULL) {
10118 			lck_mtx_lock_spin(&inp->dlth_lock);
10119 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10120 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10121 				wakeup_one((caddr_t)&inp->dlth_flags);
10122 			}
10123 			lck_mtx_unlock(&inp->dlth_lock);
10124 			ifnet_lock_done(ifp);
10125 
10126 			/* wait for the input thread to terminate */
10127 			lck_mtx_lock_spin(&inp->dlth_lock);
10128 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10129 			    == 0) {
10130 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
10131 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
10132 			}
10133 			lck_mtx_unlock(&inp->dlth_lock);
10134 			ifnet_lock_exclusive(ifp);
10135 		}
10136 
10137 		/* clean-up input thread state */
10138 		dlil_clean_threading_info(inp);
10139 		/* clean-up poll parameters */
10140 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
10141 		dlil_reset_rxpoll_params(ifp);
10142 	}
10143 
10144 	/* The driver might unload, so point these to ourselves */
10145 	if_free = ifp->if_free;
10146 	ifp->if_output_dlil = ifp_if_output;
10147 	ifp->if_output = ifp_if_output;
10148 	ifp->if_pre_enqueue = ifp_if_output;
10149 	ifp->if_start = ifp_if_start;
10150 	ifp->if_output_ctl = ifp_if_ctl;
10151 	ifp->if_input_dlil = ifp_if_input;
10152 	ifp->if_input_poll = ifp_if_input_poll;
10153 	ifp->if_input_ctl = ifp_if_ctl;
10154 	ifp->if_ioctl = ifp_if_ioctl;
10155 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10156 	ifp->if_free = ifp_if_free;
10157 	ifp->if_demux = ifp_if_demux;
10158 	ifp->if_event = ifp_if_event;
10159 	ifp->if_framer_legacy = ifp_if_framer;
10160 	ifp->if_framer = ifp_if_framer_extended;
10161 	ifp->if_add_proto = ifp_if_add_proto;
10162 	ifp->if_del_proto = ifp_if_del_proto;
10163 	ifp->if_check_multi = ifp_if_check_multi;
10164 
10165 	/* wipe out interface description */
10166 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10167 	ifp->if_desc.ifd_len = 0;
10168 	VERIFY(ifp->if_desc.ifd_desc != NULL);
10169 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
10170 
10171 	/* there shouldn't be any delegation by now */
10172 	VERIFY(ifp->if_delegated.ifp == NULL);
10173 	VERIFY(ifp->if_delegated.type == 0);
10174 	VERIFY(ifp->if_delegated.family == 0);
10175 	VERIFY(ifp->if_delegated.subfamily == 0);
10176 	VERIFY(ifp->if_delegated.expensive == 0);
10177 	VERIFY(ifp->if_delegated.constrained == 0);
10178 
10179 	/* QoS marking get cleared */
10180 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10181 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10182 
10183 #if SKYWALK
10184 	/* the nexus destructor is responsible for clearing these */
10185 	VERIFY(ifp->if_na_ops == NULL);
10186 	VERIFY(ifp->if_na == NULL);
10187 #endif /* SKYWALK */
10188 
10189 	/* promiscuous/allmulti counts need to start at zero again */
10190 	ifp->if_pcount = 0;
10191 	ifp->if_amcount = 0;
10192 	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10193 
10194 	ifnet_lock_done(ifp);
10195 
10196 #if PF
10197 	/*
10198 	 * Detach this interface from packet filter, if enabled.
10199 	 */
10200 	pf_ifnet_hook(ifp, 0);
10201 #endif /* PF */
10202 
10203 	/* Filter list should be empty */
10204 	lck_mtx_lock_spin(&ifp->if_flt_lock);
10205 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10206 	VERIFY(ifp->if_flt_busy == 0);
10207 	VERIFY(ifp->if_flt_waiters == 0);
10208 	VERIFY(ifp->if_flt_non_os_count == 0);
10209 	VERIFY(ifp->if_flt_no_tso_count == 0);
10210 	lck_mtx_unlock(&ifp->if_flt_lock);
10211 
10212 	/* Last chance to drain send queue */
10213 	if_qflush_snd(ifp, 0);
10214 
10215 	/* Last chance to cleanup any cached route */
10216 	lck_mtx_lock(&ifp->if_cached_route_lock);
10217 	VERIFY(!ifp->if_fwd_cacheok);
10218 	ROUTE_RELEASE(&ifp->if_fwd_route);
10219 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
10220 	ROUTE_RELEASE(&ifp->if_src_route);
10221 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
10222 	ROUTE_RELEASE(&ifp->if_src_route6);
10223 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
10224 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10225 
10226 	VERIFY(ifp->if_data_threshold == 0);
10227 	VERIFY(ifp->if_dt_tcall != NULL);
10228 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10229 
10230 	ifnet_llreach_ifdetach(ifp);
10231 
10232 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
10233 
10234 	/*
10235 	 * Finally, mark this ifnet as detached.
10236 	 */
10237 	if (dlil_verbose) {
10238 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
10239 	}
10240 	lck_mtx_lock_spin(&ifp->if_ref_lock);
10241 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
10242 		panic("%s: flags mismatch (detaching not set) ifp=%p",
10243 		    __func__, ifp);
10244 		/* NOTREACHED */
10245 	}
10246 	ifp->if_refflags &= ~IFRF_DETACHING;
10247 	lck_mtx_unlock(&ifp->if_ref_lock);
10248 	if (if_free != NULL) {
10249 		if_free(ifp);
10250 	}
10251 
10252 	ifclassq_release(&ifp->if_snd);
10253 
10254 	/* we're fully detached, clear the "in use" bit */
10255 	dlifp = (struct dlil_ifnet *)ifp;
10256 	lck_mtx_lock(&dlifp->dl_if_lock);
10257 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10258 	dlifp->dl_if_flags &= ~DLIF_INUSE;
10259 	lck_mtx_unlock(&dlifp->dl_if_lock);
10260 
10261 	/* Release reference held during ifnet attach */
10262 	ifnet_release(ifp);
10263 }
10264 
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	/*
	 * Output stub installed on a detached ifnet ("the driver might
	 * unload, so point these to ourselves"): drop the whole chain.
	 */
	m_freem_list(m);
	return 0;
}
10272 
void
ifp_if_start(struct ifnet *ifp)
{
	/* Start stub for a detached ifnet: just purge anything queued. */
	ifnet_purge(ifp);
}
10278 
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	/*
	 * Input stub for a detached ifnet: free the packet chain and
	 * report that the device is gone.
	 */
	m_freem_list(m_head);
	return ENXIO;
}
10288 
static void
ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
{
#pragma unused(ifp, flags, max_cnt)
	/*
	 * Poll stub for a detached ifnet: hand back an empty packet
	 * chain and zeroed counters.  Each out-parameter is optional.
	 */
	if (m_head != NULL) {
		*m_head = NULL;
	}
	if (m_tail != NULL) {
		*m_tail = NULL;
	}
	if (cnt != NULL) {
		*cnt = 0;
	}
	if (len != NULL) {
		*len = 0;
	}
}
10307 
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	/* Control stub for a detached ifnet: no commands are supported. */
	return EOPNOTSUPP;
}
10314 
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	/*
	 * Demux stub for a detached ifnet: consume the packet and tell
	 * the caller the mbuf has already been handled.
	 */
	m_freem(m);
	return EJUSTRETURN;
}
10322 
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	/* Protocol attach stub for a detached ifnet: always rejected. */
	return EINVAL;
}
10330 
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	/* Protocol detach stub for a detached ifnet: always rejected. */
	return EINVAL;
}
10337 
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	/* Multicast membership stub for a detached ifnet: unsupported. */
	return EOPNOTSUPP;
}
10344 
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
	/*
	 * Legacy framer stub for a detached ifnet: forwards to the
	 * extended framer.  On macOS the legacy signature carries no
	 * prepend/postpend pointers, so NULLs are passed instead.
	 */
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10363 
10364 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10365 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10366     const struct sockaddr *sa, const char *ll, const char *t,
10367     u_int32_t *pre, u_int32_t *post)
10368 {
10369 #pragma unused(ifp, sa, ll, t)
10370 	m_freem(*m);
10371 	*m = NULL;
10372 
10373 	if (pre != NULL) {
10374 		*pre = 0;
10375 	}
10376 	if (post != NULL) {
10377 		*post = 0;
10378 	}
10379 
10380 	return EJUSTRETURN;
10381 }
10382 
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	/* ioctl stub for a detached ifnet: nothing is supported. */
	return EOPNOTSUPP;
}
10389 
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/*
	 * BPF tap stub for a detached ifnet; reports success while doing
	 * nothing.
	 */
	/* XXX not sure what to do here */
	return 0;
}
10397 
static void
ifp_if_free(struct ifnet *ifp)
{
	/* Free stub for a detached ifnet: intentionally a no-op. */
#pragma unused(ifp)
}
10403 
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
	/* Event stub for a detached ifnet: events are ignored. */
#pragma unused(ifp, e)
}
10409 
/*
 * Acquire a dlil_ifnet for a new interface of the given family.
 *
 * First walks the global dlil_ifnet_head list (under the DLIL lock):
 *  - any in-use interface with the same extended name or the same unique
 *    id causes EBUSY;
 *  - otherwise the first not-in-use entry matching the unique id is
 *    remembered for recycling (the walk still continues to the end so
 *    name collisions are always detected).
 * If a recyclable entry was found it is marked DLIF_INUSE|DLIF_REUSE and
 * returned; otherwise a fresh dlil_ifnet is zone-allocated, aligned,
 * initialized, and linked onto the list.
 *
 * On success *ifp points at the (referenced) ifnet and 0 is returned;
 * on failure *ifp is left untouched and EBUSY/ENOMEM is returned.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		/* only entries of the requested family are candidates */
		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	/* copy the caller's unique id, if one was supplied */
	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name storage lives inside the dlil_ifnet itself */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* initialize the per-ifnet locks */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* sanity-check the alignment established by P2ROUNDUP above */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10588 
/*
 * Common teardown for releasing an ifnet back to the DLIL pool:
 * drops the allocation statistics, frees any externally-allocated
 * broadcast address storage, and resets the name/xname to the storage
 * embedded in the dlil_ifnet.  When clear_in_use is true the DLIF_INUSE
 * flag is also cleared so the entry becomes recyclable.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast addresses larger than the inline buffer were kalloc'ed */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	/* point the names back at the embedded storage */
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10619 
/* Release an ifnet without clearing its DLIF_INUSE flag. */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10625 
/* Take the global DLIL interface-list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10631 
/* Drop the global DLIL interface-list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10637 
/* Assert that the current thread owns the DLIL interface-list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10643 
/* Detach the well-known protocol families (INET/INET6) from an ifnet. */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10659 
/*
 * Copy the interface's cached IPv4 source route into *dst.
 * The spin lock is converted to a full mutex before the copy.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10670 
/*
 * Store *src as the interface's cached IPv4 source route, consuming the
 * caller's reference.  If route caching has been disabled on the ifnet
 * (if_fwd_cacheok cleared), the route is simply released instead.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10684 
/*
 * Copy the interface's cached IPv6 source route into *dst.
 * The spin lock is converted to a full mutex before the copy.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10696 
/*
 * Store *src as the interface's cached IPv6 source route, consuming the
 * caller's reference.  If route caching has been disabled on the ifnet
 * (if_fwd_cacheok cleared), the route is simply released instead.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10711 
/*
 * Look up an IPv4 route for src_ip, consulting the interface's cached
 * source route first.  On a cache miss (or unusable/stale entry) a
 * scoped route lookup is performed and the result is stored back in the
 * cache.  Returns the rtentry with a reference held by the caller, or
 * NULL if no route was found.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if needed */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10746 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet: look up a route for
 * *src_ip6 via the interface's cached IPv6 source route, refreshing the
 * cache on a miss.  Returns the rtentry with a reference held by the
 * caller, or NULL if no route was found.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if needed */
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): the v4 path asserts ro_rt == NULL here
		 * (VERIFY) while this path merely tests it; after
		 * ROUTE_RELEASE both should behave the same — confirm
		 * the asymmetry is intentional.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10783 
/*
 * Update the interface's link-quality metric and post a kernel event
 * when the value changes.
 *
 * lqm is first normalized to one of the LQM thresholds (an ABORT-level
 * value also kicks the TCP fast timer so affected connections are torn
 * down promptly).  The 'locked' flag indicates the caller already holds
 * the ifnet lock exclusively; the lock is returned to the caller in the
 * same state it was on entry (it is always dropped while the kernel
 * event is posted).
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10847 
/*
 * Update the interface's cellular RRC state and post a kernel event
 * when it changes.
 *
 * Must be called with the ifnet lock held exclusively: the lock is
 * temporarily dropped while the kernel event is posted and reacquired
 * before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op if the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10877 
10878 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10879 if_state_update(struct ifnet *ifp,
10880     struct if_interface_state *if_interface_state)
10881 {
10882 	u_short if_index_available = 0;
10883 
10884 	ifnet_lock_exclusive(ifp);
10885 
10886 	if ((ifp->if_type != IFT_CELLULAR) &&
10887 	    (if_interface_state->valid_bitmask &
10888 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10889 		ifnet_lock_done(ifp);
10890 		return ENOTSUP;
10891 	}
10892 	if ((if_interface_state->valid_bitmask &
10893 	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10894 	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10895 	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10896 		ifnet_lock_done(ifp);
10897 		return EINVAL;
10898 	}
10899 	if ((if_interface_state->valid_bitmask &
10900 	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10901 	    if_interface_state->rrc_state !=
10902 	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10903 	    if_interface_state->rrc_state !=
10904 	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10905 		ifnet_lock_done(ifp);
10906 		return EINVAL;
10907 	}
10908 
10909 	if (if_interface_state->valid_bitmask &
10910 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10911 		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10912 	}
10913 	if (if_interface_state->valid_bitmask &
10914 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10915 		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10916 	}
10917 	if (if_interface_state->valid_bitmask &
10918 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10919 		ifp->if_interface_state.valid_bitmask |=
10920 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10921 		ifp->if_interface_state.interface_availability =
10922 		    if_interface_state->interface_availability;
10923 
10924 		if (ifp->if_interface_state.interface_availability ==
10925 		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10926 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10927 			    __func__, if_name(ifp), ifp->if_index);
10928 			if_index_available = ifp->if_index;
10929 		} else {
10930 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10931 			    __func__, if_name(ifp), ifp->if_index);
10932 		}
10933 	}
10934 	ifnet_lock_done(ifp);
10935 
10936 	/*
10937 	 * Check if the TCP connections going on this interface should be
10938 	 * forced to send probe packets instead of waiting for TCP timers
10939 	 * to fire. This is done on an explicit notification such as
10940 	 * SIOCSIFINTERFACESTATE which marks the interface as available.
10941 	 */
10942 	if (if_index_available > 0) {
10943 		tcp_interface_send_probe(if_index_available);
10944 	}
10945 
10946 	return 0;
10947 }
10948 
10949 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10950 if_get_state(struct ifnet *ifp,
10951     struct if_interface_state *if_interface_state)
10952 {
10953 	ifnet_lock_shared(ifp);
10954 
10955 	if_interface_state->valid_bitmask = 0;
10956 
10957 	if (ifp->if_interface_state.valid_bitmask &
10958 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10959 		if_interface_state->valid_bitmask |=
10960 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10961 		if_interface_state->rrc_state =
10962 		    ifp->if_interface_state.rrc_state;
10963 	}
10964 	if (ifp->if_interface_state.valid_bitmask &
10965 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10966 		if_interface_state->valid_bitmask |=
10967 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10968 		if_interface_state->lqm_state =
10969 		    ifp->if_interface_state.lqm_state;
10970 	}
10971 	if (ifp->if_interface_state.valid_bitmask &
10972 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10973 		if_interface_state->valid_bitmask |=
10974 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10975 		if_interface_state->interface_availability =
10976 		    ifp->if_interface_state.interface_availability;
10977 	}
10978 
10979 	ifnet_lock_done(ifp);
10980 }
10981 
10982 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10983 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10984 {
10985 	if (conn_probe > 1) {
10986 		return EINVAL;
10987 	}
10988 	if (conn_probe == 0) {
10989 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10990 	} else {
10991 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10992 	}
10993 
10994 #if NECP
10995 	necp_update_all_clients();
10996 #endif /* NECP */
10997 
10998 	tcp_probe_connectivity(ifp, conn_probe);
10999 	return 0;
11000 }
11001 
11002 /* for uuid.c */
11003 static int
get_ether_index(int * ret_other_index)11004 get_ether_index(int * ret_other_index)
11005 {
11006 	struct ifnet *ifp;
11007 	int en0_index = 0;
11008 	int other_en_index = 0;
11009 	int any_ether_index = 0;
11010 	short best_unit = 0;
11011 
11012 	*ret_other_index = 0;
11013 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
11014 		/*
11015 		 * find en0, or if not en0, the lowest unit en*, and if not
11016 		 * that, any ethernet
11017 		 */
11018 		ifnet_lock_shared(ifp);
11019 		if (strcmp(ifp->if_name, "en") == 0) {
11020 			if (ifp->if_unit == 0) {
11021 				/* found en0, we're done */
11022 				en0_index = ifp->if_index;
11023 				ifnet_lock_done(ifp);
11024 				break;
11025 			}
11026 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
11027 				other_en_index = ifp->if_index;
11028 				best_unit = ifp->if_unit;
11029 			}
11030 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
11031 			any_ether_index = ifp->if_index;
11032 		}
11033 		ifnet_lock_done(ifp);
11034 	}
11035 	if (en0_index == 0) {
11036 		if (other_en_index != 0) {
11037 			*ret_other_index = other_en_index;
11038 		} else if (any_ether_index != 0) {
11039 			*ret_other_index = any_ether_index;
11040 		}
11041 	}
11042 	return en0_index;
11043 }
11044 
11045 int
uuid_get_ethernet(u_int8_t * node)11046 uuid_get_ethernet(u_int8_t *node)
11047 {
11048 	static int en0_index;
11049 	struct ifnet *ifp;
11050 	int other_index = 0;
11051 	int the_index = 0;
11052 	int ret;
11053 
11054 	ifnet_head_lock_shared();
11055 	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
11056 		en0_index = get_ether_index(&other_index);
11057 	}
11058 	if (en0_index != 0) {
11059 		the_index = en0_index;
11060 	} else if (other_index != 0) {
11061 		the_index = other_index;
11062 	}
11063 	if (the_index != 0) {
11064 		struct dlil_ifnet *dl_if;
11065 
11066 		ifp = ifindex2ifnet[the_index];
11067 		VERIFY(ifp != NULL);
11068 		dl_if = (struct dlil_ifnet *)ifp;
11069 		if (dl_if->dl_if_permanent_ether_is_set != 0) {
11070 			/*
11071 			 * Use the permanent ethernet address if it is
11072 			 * available because it will never change.
11073 			 */
11074 			memcpy(node, dl_if->dl_if_permanent_ether,
11075 			    ETHER_ADDR_LEN);
11076 		} else {
11077 			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
11078 		}
11079 		ret = 0;
11080 	} else {
11081 		ret = -1;
11082 	}
11083 	ifnet_head_done();
11084 	return ret;
11085 }
11086 
11087 static int
11088 sysctl_rxpoll SYSCTL_HANDLER_ARGS
11089 {
11090 #pragma unused(arg1, arg2)
11091 	uint32_t i;
11092 	int err;
11093 
11094 	i = if_rxpoll;
11095 
11096 	err = sysctl_handle_int(oidp, &i, 0, req);
11097 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11098 		return err;
11099 	}
11100 
11101 	if (net_rxpoll == 0) {
11102 		return ENXIO;
11103 	}
11104 
11105 	if_rxpoll = i;
11106 	return err;
11107 }
11108 
11109 static int
11110 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11111 {
11112 #pragma unused(arg1, arg2)
11113 	uint64_t q;
11114 	int err;
11115 
11116 	q = if_rxpoll_mode_holdtime;
11117 
11118 	err = sysctl_handle_quad(oidp, &q, 0, req);
11119 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11120 		return err;
11121 	}
11122 
11123 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11124 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11125 	}
11126 
11127 	if_rxpoll_mode_holdtime = q;
11128 
11129 	return err;
11130 }
11131 
11132 static int
11133 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11134 {
11135 #pragma unused(arg1, arg2)
11136 	uint64_t q;
11137 	int err;
11138 
11139 	q = if_rxpoll_sample_holdtime;
11140 
11141 	err = sysctl_handle_quad(oidp, &q, 0, req);
11142 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11143 		return err;
11144 	}
11145 
11146 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11147 		q = IF_RXPOLL_SAMPLETIME_MIN;
11148 	}
11149 
11150 	if_rxpoll_sample_holdtime = q;
11151 
11152 	return err;
11153 }
11154 
11155 static int
11156 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11157 {
11158 #pragma unused(arg1, arg2)
11159 	uint64_t q;
11160 	int err;
11161 
11162 	q = if_rxpoll_interval_time;
11163 
11164 	err = sysctl_handle_quad(oidp, &q, 0, req);
11165 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11166 		return err;
11167 	}
11168 
11169 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11170 		q = IF_RXPOLL_INTERVALTIME_MIN;
11171 	}
11172 
11173 	if_rxpoll_interval_time = q;
11174 
11175 	return err;
11176 }
11177 
11178 static int
11179 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11180 {
11181 #pragma unused(arg1, arg2)
11182 	uint32_t i;
11183 	int err;
11184 
11185 	i = if_sysctl_rxpoll_wlowat;
11186 
11187 	err = sysctl_handle_int(oidp, &i, 0, req);
11188 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11189 		return err;
11190 	}
11191 
11192 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11193 		return EINVAL;
11194 	}
11195 
11196 	if_sysctl_rxpoll_wlowat = i;
11197 	return err;
11198 }
11199 
11200 static int
11201 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11202 {
11203 #pragma unused(arg1, arg2)
11204 	uint32_t i;
11205 	int err;
11206 
11207 	i = if_sysctl_rxpoll_whiwat;
11208 
11209 	err = sysctl_handle_int(oidp, &i, 0, req);
11210 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11211 		return err;
11212 	}
11213 
11214 	if (i <= if_sysctl_rxpoll_wlowat) {
11215 		return EINVAL;
11216 	}
11217 
11218 	if_sysctl_rxpoll_whiwat = i;
11219 	return err;
11220 }
11221 
11222 static int
11223 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11224 {
11225 #pragma unused(arg1, arg2)
11226 	int i, err;
11227 
11228 	i = if_sndq_maxlen;
11229 
11230 	err = sysctl_handle_int(oidp, &i, 0, req);
11231 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11232 		return err;
11233 	}
11234 
11235 	if (i < IF_SNDQ_MINLEN) {
11236 		i = IF_SNDQ_MINLEN;
11237 	}
11238 
11239 	if_sndq_maxlen = i;
11240 	return err;
11241 }
11242 
11243 static int
11244 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11245 {
11246 #pragma unused(arg1, arg2)
11247 	int i, err;
11248 
11249 	i = if_rcvq_maxlen;
11250 
11251 	err = sysctl_handle_int(oidp, &i, 0, req);
11252 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11253 		return err;
11254 	}
11255 
11256 	if (i < IF_RCVQ_MINLEN) {
11257 		i = IF_RCVQ_MINLEN;
11258 	}
11259 
11260 	if_rcvq_maxlen = i;
11261 	return err;
11262 }
11263 
11264 static int
11265 sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11266 {
11267 #pragma unused(arg1, arg2)
11268 	int i, err;
11269 
11270 	i = if_rcvq_burst_limit;
11271 
11272 	err = sysctl_handle_int(oidp, &i, 0, req);
11273 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11274 		return err;
11275 	}
11276 
11277 /*
11278  * Safeguard the burst limit to "sane" values on customer builds.
11279  */
11280 #if !(DEVELOPMENT || DEBUG)
11281 	if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11282 		i = IF_RCVQ_BURST_LIMIT_MIN;
11283 	}
11284 
11285 	if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11286 		i = IF_RCVQ_BURST_LIMIT_MAX;
11287 	}
11288 #endif
11289 
11290 	if_rcvq_burst_limit = i;
11291 	return err;
11292 }
11293 
11294 static int
11295 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11296 {
11297 #pragma unused(arg1, arg2)
11298 	int i, err;
11299 
11300 	i = if_rcvq_burst_limit;
11301 
11302 	err = sysctl_handle_int(oidp, &i, 0, req);
11303 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11304 		return err;
11305 	}
11306 
11307 	if (IF_RCVQ_TRIM_PCT_MAX < i) {
11308 		i = IF_RCVQ_TRIM_PCT_MAX;
11309 	}
11310 
11311 	if (i < IF_RCVQ_TRIM_PCT_MIN) {
11312 		i = IF_RCVQ_TRIM_PCT_MIN;
11313 	}
11314 
11315 	if_rcvq_trim_pct = i;
11316 	return err;
11317 }
11318 
11319 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11320 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11321     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11322 {
11323 	struct kev_dl_node_presence kev;
11324 	struct sockaddr_dl *sdl;
11325 	struct sockaddr_in6 *sin6;
11326 	int ret = 0;
11327 
11328 	VERIFY(ifp);
11329 	VERIFY(sa);
11330 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11331 
11332 	bzero(&kev, sizeof(kev));
11333 	sin6 = &kev.sin6_node_address;
11334 	sdl = &kev.sdl_node_address;
11335 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11336 	kev.rssi = rssi;
11337 	kev.link_quality_metric = lqm;
11338 	kev.node_proximity_metric = npm;
11339 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11340 
11341 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11342 	if (ret == 0 || ret == EEXIST) {
11343 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11344 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11345 		if (err != 0) {
11346 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11347 			    "error %d\n", __func__, err);
11348 		}
11349 	}
11350 
11351 	if (ret == EEXIST) {
11352 		ret = 0;
11353 	}
11354 	return ret;
11355 }
11356 
/*
 * Report that a previously-present node has disappeared from the link.
 * "sa" may be either the node's IPv6 address (AF_INET6) or its
 * link-layer address (AF_LINK).  The node is removed from the IPv6
 * neighbor cache and, if that succeeds, a KEV_DL_NODE_ABSENCE kernel
 * event describing the node is posted.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 * (nd6_alt_node_absent() fills in kev_sdl).
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp our own interface type/index before posting */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11397 
11398 int
dlil_node_present_v2(struct ifnet * ifp,struct sockaddr * sa,struct sockaddr_dl * sdl,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11399 dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
11400     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11401 {
11402 	struct kev_dl_node_presence kev = {};
11403 	struct sockaddr_dl *kev_sdl = NULL;
11404 	struct sockaddr_in6 *kev_sin6 = NULL;
11405 	int ret = 0;
11406 
11407 	VERIFY(ifp != NULL);
11408 	VERIFY(sa != NULL && sdl != NULL);
11409 	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
11410 
11411 	kev_sin6 = &kev.sin6_node_address;
11412 	kev_sdl = &kev.sdl_node_address;
11413 
11414 	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
11415 	bcopy(sdl, kev_sdl, sdl->sdl_len);
11416 	kev_sdl->sdl_type = ifp->if_type;
11417 	kev_sdl->sdl_index = ifp->if_index;
11418 
11419 	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
11420 	bcopy(sa, kev_sin6, sa->sa_len);
11421 
11422 	kev.rssi = rssi;
11423 	kev.link_quality_metric = lqm;
11424 	kev.node_proximity_metric = npm;
11425 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11426 
11427 	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
11428 	if (ret == 0 || ret == EEXIST) {
11429 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11430 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11431 		if (err != 0) {
11432 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
11433 		}
11434 	}
11435 
11436 	if (ret == EEXIST) {
11437 		ret = 0;
11438 	}
11439 	return ret;
11440 }
11441 
11442 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11443 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11444     kauth_cred_t *credp)
11445 {
11446 	const u_int8_t *bytes;
11447 	size_t size;
11448 
11449 	bytes = CONST_LLADDR(sdl);
11450 	size = sdl->sdl_alen;
11451 
11452 #if CONFIG_MACF
11453 	if (dlil_lladdr_ckreq) {
11454 		switch (sdl->sdl_type) {
11455 		case IFT_ETHER:
11456 		case IFT_IEEE1394:
11457 			break;
11458 		default:
11459 			credp = NULL;
11460 			break;
11461 		}
11462 		;
11463 
11464 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11465 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11466 				[0] = 2
11467 			};
11468 
11469 			bytes = unspec;
11470 		}
11471 	}
11472 #else
11473 #pragma unused(credp)
11474 #endif
11475 
11476 	if (sizep != NULL) {
11477 		*sizep = size;
11478 	}
11479 	return bytes;
11480 }
11481 
11482 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11483 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11484     u_int8_t info[DLIL_MODARGLEN])
11485 {
11486 	struct kev_dl_issues kev;
11487 	struct timeval tv;
11488 
11489 	VERIFY(ifp != NULL);
11490 	VERIFY(modid != NULL);
11491 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11492 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11493 
11494 	bzero(&kev, sizeof(kev));
11495 
11496 	microtime(&tv);
11497 	kev.timestamp = tv.tv_sec;
11498 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11499 	if (info != NULL) {
11500 		bcopy(info, &kev.info, DLIL_MODARGLEN);
11501 	}
11502 
11503 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11504 	    &kev.link_data, sizeof(kev), FALSE);
11505 }
11506 
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC.
 *
 * The set command (superuser only) maps the request flags to a
 * throttling level and applies it to the interface's send queue; the
 * get command reports the current level.  On success, ifo_inuse is
 * additionally filled in with the number of opportunistic TCP/UDP
 * connections currently using the interface.  EALREADY from
 * ifnet_set_throttle() (level already in effect) is treated as
 * success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* only "block opportunistic" or "no flags" are understood */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* the requested level was already in effect; not an error */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11565 
11566 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)11567 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11568 {
11569 	struct ifclassq *ifq;
11570 	int err = 0;
11571 
11572 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11573 		return ENXIO;
11574 	}
11575 
11576 	*level = IFNET_THROTTLE_OFF;
11577 
11578 	ifq = ifp->if_snd;
11579 	IFCQ_LOCK(ifq);
11580 	/* Throttling works only for IFCQ, not ALTQ instances */
11581 	if (IFCQ_IS_ENABLED(ifq)) {
11582 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11583 
11584 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11585 		*level = req.level;
11586 	}
11587 	IFCQ_UNLOCK(ifq);
11588 
11589 	return err;
11590 }
11591 
11592 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)11593 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11594 {
11595 	struct ifclassq *ifq;
11596 	int err = 0;
11597 
11598 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11599 		return ENXIO;
11600 	}
11601 
11602 	ifq = ifp->if_snd;
11603 
11604 	switch (level) {
11605 	case IFNET_THROTTLE_OFF:
11606 	case IFNET_THROTTLE_OPPORTUNISTIC:
11607 		break;
11608 	default:
11609 		return EINVAL;
11610 	}
11611 
11612 	IFCQ_LOCK(ifq);
11613 	if (IFCQ_IS_ENABLED(ifq)) {
11614 		cqrq_throttle_t req = { 1, level };
11615 
11616 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11617 	}
11618 	IFCQ_UNLOCK(ifq);
11619 
11620 	if (err == 0) {
11621 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11622 		    level);
11623 #if NECP
11624 		necp_update_all_clients();
11625 #endif /* NECP */
11626 		if (level == IFNET_THROTTLE_OFF) {
11627 			ifnet_start(ifp);
11628 		}
11629 	}
11630 
11631 	return err;
11632 }
11633 
11634 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11635 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11636     struct proc *p)
11637 {
11638 #pragma unused(p)
11639 	errno_t result = 0;
11640 	uint32_t flags;
11641 	int level, category, subcategory;
11642 
11643 	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11644 
11645 	if (cmd == SIOCSIFLOG) {
11646 		if ((result = priv_check_cred(kauth_cred_get(),
11647 		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11648 			return result;
11649 		}
11650 
11651 		level = ifr->ifr_log.ifl_level;
11652 		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11653 			result = EINVAL;
11654 		}
11655 
11656 		flags = ifr->ifr_log.ifl_flags;
11657 		if ((flags &= IFNET_LOGF_MASK) == 0) {
11658 			result = EINVAL;
11659 		}
11660 
11661 		category = ifr->ifr_log.ifl_category;
11662 		subcategory = ifr->ifr_log.ifl_subcategory;
11663 
11664 		if (result == 0) {
11665 			result = ifnet_set_log(ifp, level, flags,
11666 			    category, subcategory);
11667 		}
11668 	} else {
11669 		result = ifnet_get_log(ifp, &level, &flags, &category,
11670 		    &subcategory);
11671 		if (result == 0) {
11672 			ifr->ifr_log.ifl_level = level;
11673 			ifr->ifr_log.ifl_flags = flags;
11674 			ifr->ifr_log.ifl_category = category;
11675 			ifr->ifr_log.ifl_subcategory = subcategory;
11676 		}
11677 	}
11678 
11679 	return result;
11680 }
11681 
/*
 * Set the logging level and facility flags on an interface.
 *
 * "level" must be within [IFNET_LOG_MIN, IFNET_LOG_MAX] and "flags"
 * must contain at least one IFNET_LOGF_MASK bit (both verified).
 * Facilities other than IFNET_LOGF_DLIL are forwarded to the driver
 * through its output control callback when one is registered;
 * otherwise they are silently dropped.  Returns 0 on success or the
 * driver callback's error.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		l.flags &= ~IFNET_LOGF_DLIL;	/* DLIL facility stays local */
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT resets the recorded facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11740 
11741 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11742 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11743     int32_t *category, int32_t *subcategory)
11744 {
11745 	if (level != NULL) {
11746 		*level = ifp->if_log.level;
11747 	}
11748 	if (flags != NULL) {
11749 		*flags = ifp->if_log.flags;
11750 	}
11751 	if (category != NULL) {
11752 		*category = ifp->if_log.category;
11753 	}
11754 	if (subcategory != NULL) {
11755 		*subcategory = ifp->if_log.subcategory;
11756 	}
11757 
11758 	return 0;
11759 }
11760 
11761 int
ifnet_notify_address(struct ifnet * ifp,int af)11762 ifnet_notify_address(struct ifnet *ifp, int af)
11763 {
11764 	struct ifnet_notify_address_params na;
11765 
11766 #if PF
11767 	(void) pf_ifaddr_hook(ifp);
11768 #endif /* PF */
11769 
11770 	if (ifp->if_output_ctl == NULL) {
11771 		return EOPNOTSUPP;
11772 	}
11773 
11774 	bzero(&na, sizeof(na));
11775 	na.address_family = (sa_family_t)af;
11776 
11777 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11778 	           sizeof(na), &na);
11779 }
11780 
11781 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11782 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11783 {
11784 	if (ifp == NULL || flowid == NULL) {
11785 		return EINVAL;
11786 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11787 	    !IF_FULLY_ATTACHED(ifp)) {
11788 		return ENXIO;
11789 	}
11790 
11791 	*flowid = ifp->if_flowhash;
11792 
11793 	return 0;
11794 }
11795 
11796 errno_t
ifnet_disable_output(struct ifnet * ifp)11797 ifnet_disable_output(struct ifnet *ifp)
11798 {
11799 	int err;
11800 
11801 	if (ifp == NULL) {
11802 		return EINVAL;
11803 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11804 	    !IF_FULLY_ATTACHED(ifp)) {
11805 		return ENXIO;
11806 	}
11807 
11808 	if ((err = ifnet_fc_add(ifp)) == 0) {
11809 		lck_mtx_lock_spin(&ifp->if_start_lock);
11810 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11811 		lck_mtx_unlock(&ifp->if_start_lock);
11812 	}
11813 	return err;
11814 }
11815 
11816 errno_t
ifnet_enable_output(struct ifnet * ifp)11817 ifnet_enable_output(struct ifnet *ifp)
11818 {
11819 	if (ifp == NULL) {
11820 		return EINVAL;
11821 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11822 	    !IF_FULLY_ATTACHED(ifp)) {
11823 		return ENXIO;
11824 	}
11825 
11826 	ifnet_start_common(ifp, TRUE, FALSE);
11827 	return 0;
11828 }
11829 
11830 void
ifnet_flowadv(uint32_t flowhash)11831 ifnet_flowadv(uint32_t flowhash)
11832 {
11833 	struct ifnet_fc_entry *ifce;
11834 	struct ifnet *ifp;
11835 
11836 	ifce = ifnet_fc_get(flowhash);
11837 	if (ifce == NULL) {
11838 		return;
11839 	}
11840 
11841 	VERIFY(ifce->ifce_ifp != NULL);
11842 	ifp = ifce->ifce_ifp;
11843 
11844 	/* flow hash gets recalculated per attach, so check */
11845 	if (ifnet_is_attached(ifp, 1)) {
11846 		if (ifp->if_flowhash == flowhash) {
11847 			(void) ifnet_enable_output(ifp);
11848 		}
11849 		ifnet_decr_iorefcnt(ifp);
11850 	}
11851 	ifnet_fc_entry_free(ifce);
11852 }
11853 
11854 /*
11855  * Function to compare ifnet_fc_entries in ifnet flow control tree
11856  */
11857 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11858 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11859 {
11860 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11861 }
11862 
/*
 * Register the interface in the flow-control tree, keyed by its flow
 * hash, so a later flow advisory (ifnet_flowadv) can find it and
 * re-enable output.  Returns 0 if the entry was added or is already
 * present; EAGAIN on a flow-hash collision with a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: the Z_WAITOK allocation below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11906 
/*
 * Look up and unlink the flow-control entry for "flowhash".  Returns
 * NULL when no entry exists, or when the entry's interface is no
 * longer attached (the entry is freed in that case).  On success the
 * caller owns the returned entry and must release it with
 * ifnet_fc_entry_free().
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* the entry is always removed; ownership moves to the caller */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11944 
/*
 * Return a flow-control entry to its zone.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11950 
11951 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11952 ifnet_calc_flowhash(struct ifnet *ifp)
11953 {
11954 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11955 	uint32_t flowhash = 0;
11956 
11957 	if (ifnet_flowhash_seed == 0) {
11958 		ifnet_flowhash_seed = RandomULong();
11959 	}
11960 
11961 	bzero(&fh, sizeof(fh));
11962 
11963 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11964 	fh.ifk_unit = ifp->if_unit;
11965 	fh.ifk_flags = ifp->if_flags;
11966 	fh.ifk_eflags = ifp->if_eflags;
11967 	fh.ifk_capabilities = ifp->if_capabilities;
11968 	fh.ifk_capenable = ifp->if_capenable;
11969 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11970 	fh.ifk_rand1 = RandomULong();
11971 	fh.ifk_rand2 = RandomULong();
11972 
11973 try_again:
11974 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11975 	if (flowhash == 0) {
11976 		/* try to get a non-zero flowhash */
11977 		ifnet_flowhash_seed = RandomULong();
11978 		goto try_again;
11979 	}
11980 
11981 	return flowhash;
11982 }
11983 
/*
 * Store the per-address-family network signature for the interface.
 * A length of zero clears the signature; a length larger than the
 * per-family storage is rejected with EINVAL.  ENOMEM is returned
 * when the per-family extra data was never allocated.  The "flags"
 * argument is currently unused.  Only AF_INET and AF_INET6 are
 * supported.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* unlock here; break leaves the switch */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* unlock here; break leaves the switch */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
12045 
/*
 * Copy the per-address-family network signature of the interface into
 * "data".  On input *len is the caller's buffer size; on success it
 * is updated to the stored signature length.  Errors: EINVAL for bad
 * arguments, a too-small buffer, or an unsupported family; ENOENT
 * when no signature is set; ENOMEM when the per-family extra data
 * was never allocated.  *flags (optional) is always reported as 0.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must fit the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must fit the whole signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12106 
12107 int
ifnet_set_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12108 ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12109 {
12110 	int i, error = 0, one_set = 0;
12111 
12112 	if_inet6data_lock_exclusive(ifp);
12113 
12114 	if (IN6_IFEXTRA(ifp) == NULL) {
12115 		error = ENOMEM;
12116 		goto out;
12117 	}
12118 
12119 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12120 		uint32_t prefix_len =
12121 		    prefixes[i].prefix_len;
12122 		struct in6_addr *prefix =
12123 		    &prefixes[i].ipv6_prefix;
12124 
12125 		if (prefix_len == 0) {
12126 			clat_log0((LOG_DEBUG,
12127 			    "NAT64 prefixes purged from Interface %s\n",
12128 			    if_name(ifp)));
12129 			/* Allow clearing the signature */
12130 			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
12131 			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12132 			    sizeof(struct in6_addr));
12133 
12134 			continue;
12135 		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
12136 		    prefix_len != NAT64_PREFIX_LEN_40 &&
12137 		    prefix_len != NAT64_PREFIX_LEN_48 &&
12138 		    prefix_len != NAT64_PREFIX_LEN_56 &&
12139 		    prefix_len != NAT64_PREFIX_LEN_64 &&
12140 		    prefix_len != NAT64_PREFIX_LEN_96) {
12141 			clat_log0((LOG_DEBUG,
12142 			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
12143 			error = EINVAL;
12144 			goto out;
12145 		}
12146 
12147 		if (IN6_IS_SCOPE_EMBED(prefix)) {
12148 			clat_log0((LOG_DEBUG,
12149 			    "NAT64 prefix has interface/link local scope.\n"));
12150 			error = EINVAL;
12151 			goto out;
12152 		}
12153 
12154 		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
12155 		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12156 		    sizeof(struct in6_addr));
12157 		clat_log0((LOG_DEBUG,
12158 		    "NAT64 prefix set to %s with prefixlen: %d\n",
12159 		    ip6_sprintf(prefix), prefix_len));
12160 		one_set = 1;
12161 	}
12162 
12163 out:
12164 	if_inet6data_lock_done(ifp);
12165 
12166 	if (error == 0 && one_set != 0) {
12167 		necp_update_all_clients();
12168 	}
12169 
12170 	return error;
12171 }
12172 
12173 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12174 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12175 {
12176 	int i, found_one = 0, error = 0;
12177 
12178 	if (ifp == NULL) {
12179 		return EINVAL;
12180 	}
12181 
12182 	if_inet6data_lock_shared(ifp);
12183 
12184 	if (IN6_IFEXTRA(ifp) == NULL) {
12185 		error = ENOMEM;
12186 		goto out;
12187 	}
12188 
12189 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12190 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12191 			found_one = 1;
12192 		}
12193 	}
12194 
12195 	if (found_one == 0) {
12196 		error = ENOENT;
12197 		goto out;
12198 	}
12199 
12200 	if (prefixes) {
12201 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12202 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12203 	}
12204 
12205 out:
12206 	if_inet6data_lock_done(ifp);
12207 
12208 	return error;
12209 }
12210 
/*
 * Debug aid for the transmit path: when HWCKSUM_DBG_FINALIZE_FORCED is
 * set in hwcksum_dbg_mode, force the IP/transport checksums to be
 * finalized in software before handing the packet to the driver, and
 * bump the hwcksum_dbg_finalized_* counters for whatever was computed.
 * TSO packets are skipped since segmentation handles their checksums.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		/* Count only what was actually finalized in software */
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* Non-IP families: nothing to finalize */
		return;
	}
}
12252 
/*
 * Debug aid for the receive path.  Depending on hwcksum_dbg_mode this
 * either forces partial checksum offload (HWCKSUM_DBG_PARTIAL_FORCED)
 * by computing the 16-bit 1's complement sum in software from a
 * configurable offset, and/or verifies a partial checksum supplied by
 * the driver/hardware against a software-computed reference, optionally
 * re-adjusting it to emulate hardware that sums from a different start
 * offset (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ).  Outcomes are recorded in the
 * hwcksum_dbg_* counters.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer against the mbuf bounds */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - frame_header);

	/* Only IPv4/IPv6 traffic is checked */
	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard whatever RX checksum info the driver provided */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-base the verified sum onto the adjusted offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12377 
12378 static int
12379 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12380 {
12381 #pragma unused(arg1, arg2)
12382 	u_int32_t i;
12383 	int err;
12384 
12385 	i = hwcksum_dbg_mode;
12386 
12387 	err = sysctl_handle_int(oidp, &i, 0, req);
12388 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12389 		return err;
12390 	}
12391 
12392 	if (hwcksum_dbg == 0) {
12393 		return ENODEV;
12394 	}
12395 
12396 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12397 		return EINVAL;
12398 	}
12399 
12400 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12401 
12402 	return err;
12403 }
12404 
12405 static int
12406 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12407 {
12408 #pragma unused(arg1, arg2)
12409 	u_int32_t i;
12410 	int err;
12411 
12412 	i = hwcksum_dbg_partial_rxoff_forced;
12413 
12414 	err = sysctl_handle_int(oidp, &i, 0, req);
12415 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12416 		return err;
12417 	}
12418 
12419 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12420 		return ENODEV;
12421 	}
12422 
12423 	hwcksum_dbg_partial_rxoff_forced = i;
12424 
12425 	return err;
12426 }
12427 
12428 static int
12429 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12430 {
12431 #pragma unused(arg1, arg2)
12432 	u_int32_t i;
12433 	int err;
12434 
12435 	i = hwcksum_dbg_partial_rxoff_adj;
12436 
12437 	err = sysctl_handle_int(oidp, &i, 0, req);
12438 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12439 		return err;
12440 	}
12441 
12442 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12443 		return ENODEV;
12444 	}
12445 
12446 	hwcksum_dbg_partial_rxoff_adj = i;
12447 
12448 	return err;
12449 }
12450 
12451 static int
12452 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12453 {
12454 #pragma unused(oidp, arg1, arg2)
12455 	int err;
12456 
12457 	if (req->oldptr == USER_ADDR_NULL) {
12458 	}
12459 	if (req->newptr != USER_ADDR_NULL) {
12460 		return EPERM;
12461 	}
12462 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12463 	    sizeof(struct chain_len_stats));
12464 
12465 	return err;
12466 }
12467 
12468 #if DEBUG || DEVELOPMENT
/*
 * Blob of arbitrary test bytes used as input for the sum16 self-tests;
 * spans of this buffer are checksummed and compared against the
 * precomputed reference sums in sumtbl[].
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12505 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* TRUE once sumr has been filled in at runtime */
	uint16_t        len;    /* span length (bytes of sumdata) */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl[] */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12530 
12531 static void
dlil_verify_sum16(void)12532 dlil_verify_sum16(void)
12533 {
12534 	struct mbuf *m;
12535 	uint8_t *buf;
12536 	int n;
12537 
12538 	/* Make sure test data plus extra room for alignment fits in cluster */
12539 	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12540 
12541 	kprintf("DLIL: running SUM16 self-tests ... ");
12542 
12543 	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12544 	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12545 
12546 	buf = mtod(m, uint8_t *);               /* base address */
12547 
12548 	for (n = 0; n < SUMTBL_MAX; n++) {
12549 		uint16_t len = sumtbl[n].len;
12550 		int i;
12551 
12552 		/* Verify for all possible alignments */
12553 		for (i = 0; i < (int)sizeof(uint64_t); i++) {
12554 			uint16_t sum, sumr;
12555 			uint8_t *c;
12556 
12557 			/* Copy over test data to mbuf */
12558 			VERIFY(len <= sizeof(sumdata));
12559 			c = buf + i;
12560 			bcopy(sumdata, c, len);
12561 
12562 			/* Zero-offset test (align by data pointer) */
12563 			m->m_data = (caddr_t)c;
12564 			m->m_len = len;
12565 			sum = m_sum16(m, 0, len);
12566 
12567 			if (!sumtbl[n].init) {
12568 				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12569 				sumtbl[n].sumr = sumr;
12570 				sumtbl[n].init = TRUE;
12571 			} else {
12572 				sumr = sumtbl[n].sumr;
12573 			}
12574 
12575 			/* Something is horribly broken; stop now */
12576 			if (sumr != sumtbl[n].sumrp) {
12577 				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12578 				    "for len=%d align=%d sum=0x%04x "
12579 				    "[expected=0x%04x]\n", __func__,
12580 				    len, i, sum, sumr);
12581 				/* NOTREACHED */
12582 			} else if (sum != sumr) {
12583 				panic_plain("\n%s: broken m_sum16() for len=%d "
12584 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12585 				    __func__, len, i, sum, sumr);
12586 				/* NOTREACHED */
12587 			}
12588 
12589 			/* Alignment test by offset (fixed data pointer) */
12590 			m->m_data = (caddr_t)buf;
12591 			m->m_len = i + len;
12592 			sum = m_sum16(m, i, len);
12593 
12594 			/* Something is horribly broken; stop now */
12595 			if (sum != sumr) {
12596 				panic_plain("\n%s: broken m_sum16() for len=%d "
12597 				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
12598 				    __func__, len, i, sum, sumr);
12599 				/* NOTREACHED */
12600 			}
12601 #if INET
12602 			/* Simple sum16 contiguous buffer test by aligment */
12603 			sum = b_sum16(c, len);
12604 
12605 			/* Something is horribly broken; stop now */
12606 			if (sum != sumr) {
12607 				panic_plain("\n%s: broken b_sum16() for len=%d "
12608 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12609 				    __func__, len, i, sum, sumr);
12610 				/* NOTREACHED */
12611 			}
12612 #endif /* INET */
12613 		}
12614 	}
12615 	m_freem(m);
12616 
12617 	kprintf("PASSED\n");
12618 }
12619 #endif /* DEBUG || DEVELOPMENT */
12620 
/* Expand to a switch case that returns the stringified constant name */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel-event code to its symbolic name for logging;
 * returns "" for codes not listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12657 
12658 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12659 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12660 {
12661 #pragma unused(arg1)
12662 	struct ifnet *ifp = arg0;
12663 
12664 	if (ifnet_is_attached(ifp, 1)) {
12665 		nstat_ifnet_threshold_reached(ifp->if_index);
12666 		ifnet_decr_iorefcnt(ifp);
12667 	}
12668 }
12669 
/*
 * Called from the data path to check whether the interface's combined
 * in+out byte count has advanced past if_data_threshold since the last
 * notification, and if so schedule if_dt_tcall to inform
 * NetworkStatistics.  The OSCompareAndSwap64() lets exactly one caller
 * claim a given byte-count transition, and thread_call_isactive() plus
 * the periodic deadline rate-limit notifications to roughly one per
 * threshold_interval seconds.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer to the next interval boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12699 
12700 #if (DEVELOPMENT || DEBUG)
12701 /*
12702  * The sysctl variable name contains the input parameters of
12703  * ifnet_get_keepalive_offload_frames()
12704  *  ifp (interface index): name[0]
12705  *  frames_array_count:    name[1]
12706  *  frame_data_offset:     name[2]
12707  * The return length gives used_frames_count
12708  */
/*
 * sysctl handler returning the keep-alive offload frames for one
 * interface (root only).  Input comes via the OID name vector (see the
 * comment block above): name[0] = interface index, name[1] = number of
 * frame slots the caller sized its buffer for, name[2] = offset of the
 * frame payload.  On success the frames are copied out one by one; the
 * returned length implies the used frame count.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
	/* read-only: reject writes */
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* Translate the index to an ifnet under the head lock */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	/* Temporary kernel copy of the frame array */
	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out only the frames actually filled in */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12800 #endif /* DEVELOPMENT || DEBUG */
12801 
/* Thin wrapper: forward per-flow interface statistics to the TCP layer. */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12808 
/*
 * Atomically OR set_flags into *flags_p; returns the flags value
 * observed immediately before the update (OSBitOrAtomic semantics).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12814 
/* Atomically clear clear_flags in *flags_p (AND with the complement). */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12820 
/*
 * Atomically set bits in the interface's extended flags (if_eflags);
 * returns the prior flags value.
 */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12826 
/* Atomically clear bits in the interface's extended flags (if_eflags). */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12832 
/*
 * Atomically set bits in the interface's extra flags (if_xflags);
 * returns the prior flags value.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12838 
/* Atomically clear bits in the interface's extra flags (if_xflags). */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12844 
/*
 * Bump the interface's traffic-rule generation counter so observers
 * using ifnet_sync_traffic_rule_genid() notice the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12850 
12851 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12852 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12853 {
12854 	if (*genid != ifp->if_traffic_rule_genid) {
12855 		*genid = ifp->if_traffic_rule_genid;
12856 		return TRUE;
12857 	}
12858 	return FALSE;
12859 }
/*
 * Publish a new traffic-rule count (release-ordered store) and bump the
 * generation counter so readers re-sync.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12866 
12867 static void
log_hexdump(void * data,size_t len)12868 log_hexdump(void *data, size_t len)
12869 {
12870 	size_t i, j, k;
12871 	unsigned char *ptr = (unsigned char *)data;
12872 #define MAX_DUMP_BUF 32
12873 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12874 
12875 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12876 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12877 			unsigned char msnbl = ptr[j] >> 4;
12878 			unsigned char lsnbl = ptr[j] & 0x0f;
12879 
12880 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12881 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12882 
12883 			if ((j % 2) == 1) {
12884 				buf[k++] = ' ';
12885 			}
12886 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12887 				buf[k++] = ' ';
12888 			}
12889 		}
12890 		buf[k] = 0;
12891 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12892 	}
12893 }
12894 
12895 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12896 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12897 net_check_compatible_if_filter(struct ifnet *ifp)
12898 {
12899 	if (ifp == NULL) {
12900 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12901 			return false;
12902 		}
12903 	} else {
12904 		if (ifp->if_flt_non_os_count > 0) {
12905 			return false;
12906 		}
12907 	}
12908 	return true;
12909 }
12910 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12911 
/*
 * Advance the output cursor `c' past the `k' bytes just formatted and
 * jump to the enclosing function's `done:' label once less than one
 * byte of space remains.  Relies on locals c, k, clen and a done label.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
12918 
12919 int dlil_dump_top_if_qlen(char *, int);
12920 int
dlil_dump_top_if_qlen(char * str,int str_len)12921 dlil_dump_top_if_qlen(char *str, int str_len)
12922 {
12923 	char *c = str;
12924 	int k, clen = str_len;
12925 	struct ifnet *top_ifcq_ifp = NULL;
12926 	uint32_t top_ifcq_len = 0;
12927 	struct ifnet *top_inq_ifp = NULL;
12928 	uint32_t top_inq_len = 0;
12929 
12930 	for (int ifidx = 1; ifidx < if_index; ifidx++) {
12931 		struct ifnet *ifp = ifindex2ifnet[ifidx];
12932 		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12933 
12934 		if (ifp == NULL) {
12935 			continue;
12936 		}
12937 		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12938 			top_ifcq_len = ifp->if_snd->ifcq_len;
12939 			top_ifcq_ifp = ifp;
12940 		}
12941 		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12942 			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12943 			top_inq_ifp = ifp;
12944 		}
12945 	}
12946 
12947 	if (top_ifcq_ifp != NULL) {
12948 		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12949 		    top_ifcq_len, top_ifcq_ifp->if_xname);
12950 		DUMP_BUF_CHK();
12951 	}
12952 	if (top_inq_ifp != NULL) {
12953 		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12954 		    top_inq_len, top_inq_ifp->if_xname);
12955 		DUMP_BUF_CHK();
12956 	}
12957 done:
12958 	return str_len - clen;
12959 }
12960