xref: /xnu-10002.41.9/bsd/net/dlil.c (revision 699cd48037512bf4380799317ca44ca453c82f57)
1 /*
2  * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63 
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70 
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
/* KDEBUG trace codes for the DLIL input/output fast paths */
#define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
#define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
#define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
#define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
#define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))

/* Maximum sizes, expressed in 32-bit longwords */
#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
#define MAX_LINKADDR        4 /* LONGWORDS */

/* DLIL diagnostics go to the system log (printf) rather than kprintf */
#if 1
#define DLIL_PRINTF     printf
#else
#define DLIL_PRINTF     kprintf
#endif

/* Compile-time checks that a statistics field is 64-bit aligned */
#define IF_DATA_REQUIRE_ALIGNED_64(f)   \
	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))

#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
166 
/*
 * Protocol KPI versions; selects which member of the kpi union in
 * struct if_proto (v1 or v2) holds the registered callbacks.
 */
enum {
	kProtoKPI_v1    = 1,
	kProtoKPI_v2    = 2
};

/* Monotonic counter used to stamp each newly created interface */
uint64_t if_creation_generation_count = 0;
173 
/*
 * List of if_proto structures in if_proto_hash[] is protected by
 * the ifnet lock.  The rest of the fields are initialized at protocol
 * attach time and never change, thus no lock required as long as
 * a reference to it is valid, via if_proto_ref().
 */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;      /* if_proto_hash[] linkage */
	u_int32_t                   refcount;       /* outstanding references */
	u_int32_t                   detached;       /* set once proto is detached */
	struct ifnet                *ifp;           /* interface we are attached to */
	protocol_family_t           protocol_family;
	int                         proto_kpi;      /* kProtoKPI_v1 or kProtoKPI_v2 */
	union {
		/* v1 KPI: per-packet input callback */
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		/* v2 KPI: packet-chain input callback */
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
208 
/* Bucket head type for the per-interface protocol hash (if_proto_hash[]) */
SLIST_HEAD(proto_hash_entry, if_proto);

/* Usable bytes of sdl_data storage inside a DLIL_SDLMAXLEN-sized sockaddr_dl */
#define DLIL_SDLDATALEN \
	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
213 
/*
 * DLIL's private wrapper around the public ifnet.  Instances are kept on
 * dlil_ifnet_head and recycled rather than freed (see DLIF_INUSE/DLIF_REUSE);
 * the embedded dl_if member must remain the first field so that
 * DLIL_TO_IFP()/IFP_TO_DLIL() can convert by simple casting.
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct {
		struct ifaddr   ifa;            /* lladdr ifa */
		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;                         /* link-level address ifaddr */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* nonzero once the above is valid */
	u_int8_t dl_if_unused;                  /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
241 
/* Values for dl_if_flags (private to DLIL) */
#define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
#define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
#define DLIF_DEBUG      0x4     /* has debugging info */

#define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */

/* For gdb: exported so the debugger can size the ref trace arrays */
__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
251 
/*
 * Debug variant of dlil_ifnet (allocated when DLIF_DEBUG is in effect);
 * records reference-count history for leak diagnosis.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
262 
/* Convert between a dlil_ifnet and its embedded (first-member) ifnet */
#define DLIL_TO_IFP(s)  (&s->dl_if)
#define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)

/*
 * An attached interface filter; one per iflt_attach() registration.
 * Callback pointers are NULL for events the filter does not handle.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* per-ifnet filter list */
	u_int32_t                       filt_skip;      /* nonzero: bypass filter */
	u_int32_t                       filt_flags;
	ifnet_t                         filt_ifp;       /* interface filtered */
	const char                      *filt_name;
	void                            *filt_cookie;   /* caller-supplied context */
	protocol_family_t               filt_protocol;  /* 0 matches all protocols */
	iff_input_func                  filt_input;
	iff_output_func                 filt_output;
	iff_event_func                  filt_event;
	iff_ioctl_func                  filt_ioctl;
	iff_detached_func               filt_detached;
};
280 
281 /* Mbuf queue used for freeing the excessive mbufs */
282 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
283 
284 struct proto_input_entry;
285 
286 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
287 
288 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
289 
290 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
291 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
292 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
293 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
294 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
295 
296 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
297 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
298     &dlil_lck_attributes);
299 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
300     &dlil_lck_attributes);
301 
302 #if DEBUG
303 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
304 #else
305 static unsigned int ifnet_debug;        /* debugging (disabled) */
306 #endif /* !DEBUG */
307 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
308 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
309 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
310 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
311 
312 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
313 
314 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
315 
316 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
317 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
318 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
319 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
320 
321 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
322 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
323 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
324 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
325 
326 static u_int32_t net_rtref;
327 
328 static struct dlil_main_threading_info dlil_main_input_thread_info;
329 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
330     (struct dlil_threading_info *)&dlil_main_input_thread_info;
331 
332 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
333 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
334 static void dlil_if_trace(struct dlil_ifnet *, int);
335 static void if_proto_ref(struct if_proto *);
336 static void if_proto_free(struct if_proto *);
337 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
338 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
339     u_int32_t list_count);
340 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
341 static void if_flt_monitor_busy(struct ifnet *);
342 static void if_flt_monitor_unbusy(struct ifnet *);
343 static void if_flt_monitor_enter(struct ifnet *);
344 static void if_flt_monitor_leave(struct ifnet *);
345 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
346     char **, protocol_family_t);
347 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
348     protocol_family_t);
349 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
350     const struct sockaddr_dl *);
351 static int ifnet_lookup(struct ifnet *);
352 static void if_purgeaddrs(struct ifnet *);
353 
354 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
355     struct mbuf *, char *);
356 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
357     struct mbuf *);
358 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
359     mbuf_t *, const struct sockaddr *, void *, char *, char *);
360 static void ifproto_media_event(struct ifnet *, protocol_family_t,
361     const struct kev_msg *);
362 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
363     unsigned long, void *);
364 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
365     struct sockaddr_dl *, size_t);
366 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
367     const struct sockaddr_dl *, const struct sockaddr *,
368     const struct sockaddr_dl *, const struct sockaddr *);
369 
370 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
371     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
372     boolean_t poll, struct thread *tp);
373 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
374     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
375 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
376 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
377     protocol_family_t *);
378 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
379     const struct ifnet_demux_desc *, u_int32_t);
380 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
381 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
382 #if !XNU_TARGET_OS_OSX
383 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
384     const struct sockaddr *, const char *, const char *,
385     u_int32_t *, u_int32_t *);
386 #else /* XNU_TARGET_OS_OSX */
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388     const struct sockaddr *, const char *, const char *);
389 #endif /* XNU_TARGET_OS_OSX */
390 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
391     const struct sockaddr *, const char *, const char *,
392     u_int32_t *, u_int32_t *);
393 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
394 static void ifp_if_free(struct ifnet *);
395 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
396 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
397 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
398 
399 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
400     dlil_freeq_t *, struct ifnet_stat_increment_param *);
401 
402 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
403     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
404     boolean_t, struct thread *);
405 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
406     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
407     boolean_t, struct thread *);
408 
409 static void dlil_main_input_thread_func(void *, wait_result_t);
410 static void dlil_main_input_thread_cont(void *, wait_result_t);
411 
412 static void dlil_input_thread_func(void *, wait_result_t);
413 static void dlil_input_thread_cont(void *, wait_result_t);
414 
415 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
416 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
417 
418 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
419     thread_continue_t *);
420 static void dlil_terminate_input_thread(struct dlil_threading_info *);
421 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
422     struct dlil_threading_info *, struct ifnet *, boolean_t);
423 static boolean_t dlil_input_stats_sync(struct ifnet *,
424     struct dlil_threading_info *);
425 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
426     u_int32_t, ifnet_model_t, boolean_t);
427 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
428     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
429 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
430 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
431 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
432 #if DEBUG || DEVELOPMENT
433 static void dlil_verify_sum16(void);
434 #endif /* DEBUG || DEVELOPMENT */
435 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
436     protocol_family_t);
437 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
438     protocol_family_t);
439 
440 static void dlil_incr_pending_thread_count(void);
441 static void dlil_decr_pending_thread_count(void);
442 
443 static void ifnet_detacher_thread_func(void *, wait_result_t);
444 static void ifnet_detacher_thread_cont(void *, wait_result_t);
445 static void ifnet_detach_final(struct ifnet *);
446 static void ifnet_detaching_enqueue(struct ifnet *);
447 static struct ifnet *ifnet_detaching_dequeue(void);
448 
449 static void ifnet_start_thread_func(void *, wait_result_t);
450 static void ifnet_start_thread_cont(void *, wait_result_t);
451 
452 static void ifnet_poll_thread_func(void *, wait_result_t);
453 static void ifnet_poll_thread_cont(void *, wait_result_t);
454 
455 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
456     classq_pkt_t *, boolean_t, boolean_t *);
457 
458 static void ifp_src_route_copyout(struct ifnet *, struct route *);
459 static void ifp_src_route_copyin(struct ifnet *, struct route *);
460 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
461 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
462 
463 static errno_t if_mcasts_update_async(struct ifnet *);
464 
465 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
470 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
471 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
473 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
474 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
475 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
476 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
477 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
478 
479 struct chain_len_stats tx_chain_len_stats;
480 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
481 
482 #if TEST_INPUT_THREAD_TERMINATION
483 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
484 #endif /* TEST_INPUT_THREAD_TERMINATION */
485 
486 /* The following are protected by dlil_ifnet_lock */
487 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
488 static u_int32_t ifnet_detaching_cnt;
489 static boolean_t ifnet_detaching_embryonic;
490 static void *ifnet_delayed_run; /* wait channel for detaching thread */
491 
492 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
493     &dlil_lck_attributes);
494 
495 static uint32_t ifnet_flowhash_seed;
496 
/*
 * Input to ifnet_calc_flowhash(): identifying attributes of an interface
 * plus two random words, folded into a per-interface flow hash.
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name */
	uint32_t        ifk_unit;               /* unit number */
	uint32_t        ifk_flags;
	uint32_t        ifk_eflags;             /* extended flags */
	uint32_t        ifk_capabilities;
	uint32_t        ifk_capenable;
	uint32_t        ifk_output_sched_model;
	uint32_t        ifk_rand1;              /* random salt */
	uint32_t        ifk_rand2;              /* random salt */
};
508 
/* Flow control entry per interface; node in ifnet_fc_tree (red-black tree) */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* tree linkage */
	u_int32_t       ifce_flowhash;          /* key: interface flow hash */
	struct ifnet    *ifce_ifp;              /* interface, NULL once detached */
};
515 
516 static uint32_t ifnet_calc_flowhash(struct ifnet *);
517 static int ifce_cmp(const struct ifnet_fc_entry *,
518     const struct ifnet_fc_entry *);
519 static int ifnet_fc_add(struct ifnet *);
520 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
521 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
522 
523 /* protected by ifnet_fc_lock */
524 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
525 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
526 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
527 
528 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
529 
530 extern void bpfdetach(struct ifnet *);
531 extern void proto_input_run(void);
532 
533 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
534     u_int32_t flags);
535 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
536     u_int32_t flags);
537 
538 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
539 
540 #if CONFIG_MACF
541 #if !XNU_TARGET_OS_OSX
542 int dlil_lladdr_ckreq = 1;
543 #else /* XNU_TARGET_OS_OSX */
544 int dlil_lladdr_ckreq = 0;
545 #endif /* XNU_TARGET_OS_OSX */
546 #endif /* CONFIG_MACF */
547 
548 #if DEBUG
549 int dlil_verbose = 1;
550 #else
551 int dlil_verbose = 0;
552 #endif /* DEBUG */
553 #if IFNET_INPUT_SANITY_CHK
554 /* sanity checking of input packet lists received */
555 static u_int32_t dlil_input_sanity_check = 0;
556 #endif /* IFNET_INPUT_SANITY_CHK */
557 /* rate limit debug messages */
558 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
559 
560 SYSCTL_DECL(_net_link_generic_system);
561 
562 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
563     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
564 
565 #define IF_SNDQ_MINLEN  32
566 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
568     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
569     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
570 
571 #define IF_RCVQ_MINLEN  32
572 #define IF_RCVQ_MAXLEN  256
573 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
574 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
575     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
576     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
577 
578 /*
579  * Protect against possible memory starvation that may happen
580  * when the driver is pushing data faster than the AP can process.
581  *
582  * If at any point during DLIL input phase any of the input queues
583  * exceeds the burst limit, DLIL will start to trim the queue,
584  * by returning mbufs in the input queue to the cache from which
585  * the mbufs were originally allocated, starting from the oldest
586  * mbuf and continuing until the new limit (see below) is reached.
587  *
588  * In order to avoid a steplocked equilibrium, the trimming
589  * will continue PAST the burst limit, until the corresponding
590  * input queue is reduced to `if_rcvq_trim_pct' %.
591  *
592  * For example, if the input queue limit is 1024 packets,
593  * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
594  * the trimming will continue until the queue contains 819 packets
595  * (1024 * 80 / 100 == 819).
596  *
597  * Setting the burst limit too low can hurt the throughput,
598  * while setting the burst limit too high can defeat the purpose.
599  */
600 #define IF_RCVQ_BURST_LIMIT_MIN         1024
601 #define IF_RCVQ_BURST_LIMIT_DEFAULT     8192
602 #define IF_RCVQ_BURST_LIMIT_MAX         32768
603 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
604 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
605     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
606     sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
607 
608 #define IF_RCVQ_TRIM_PCT_MIN            20
609 #define IF_RCVQ_TRIM_PCT_DEFAULT        80
610 #define IF_RCVQ_TRIM_PCT_MAX            100
611 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
612 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
613     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
614     sysctl_rcvq_trim_pct, "I",
615     "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
616 
617 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
618 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
619 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
620     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
621     "ilog2 of EWMA decay rate of avg inbound packets");
622 
623 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
624 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
625 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
626 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
627     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
628     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
629     "Q", "input poll mode freeze time");
630 
631 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
632 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
633 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
634 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
635     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
636     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
637     "Q", "input poll sampling time");
638 
639 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
640 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
641     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
642     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
643     "Q", "input poll interval (time)");
644 
645 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
646 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
647 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
648     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
649     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
650 
651 #define IF_RXPOLL_WLOWAT        10
652 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
653 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
654     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
655     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
656     "I", "input poll wakeup low watermark");
657 
658 #define IF_RXPOLL_WHIWAT        100
659 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
660 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
661     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
662     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
663     "I", "input poll wakeup high watermark");
664 
665 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
666 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
667     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
668     "max packets per poll call");
669 
670 u_int32_t if_rxpoll = 1;
671 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
672     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
673     sysctl_rxpoll, "I", "enable opportunistic input polling");
674 
675 #if TEST_INPUT_THREAD_TERMINATION
676 static u_int32_t if_input_thread_termination_spin = 0;
677 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
678     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
679     &if_input_thread_termination_spin, 0,
680     sysctl_input_thread_termination_spin,
681     "I", "input thread termination spin limit");
682 #endif /* TEST_INPUT_THREAD_TERMINATION */
683 
684 static u_int32_t cur_dlil_input_threads = 0;
685 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
686     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
687     "Current number of DLIL input threads");
688 
689 #if IFNET_INPUT_SANITY_CHK
690 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
691     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
692     "Turn on sanity checking in DLIL input");
693 #endif /* IFNET_INPUT_SANITY_CHK */
694 
695 static u_int32_t if_flowadv = 1;
696 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
697     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
698     "enable flow-advisory mechanism");
699 
700 static u_int32_t if_delaybased_queue = 1;
701 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
702     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
703     "enable delay based dynamic queue sizing");
704 
705 static uint64_t hwcksum_in_invalidated = 0;
706 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
707     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
708     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
709 
710 uint32_t hwcksum_dbg = 0;
711 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
712     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
713     "enable hardware cksum debugging");
714 
715 u_int32_t ifnet_start_delayed = 0;
716 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
717     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
718     "number of times start was delayed");
719 
720 u_int32_t ifnet_delay_start_disabled = 0;
721 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
722     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
723     "number of times start was delayed");
724 
/*
 * Atomically bump the ifnet_delay_start_disabled counter exported via
 * the net.link.generic.system.start_delay_disabled sysctl above.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
730 
/* Bits accepted in hwcksum_dbg_mode (validated against HWCKSUM_DBG_MASK). */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* Active hardware-checksum debugging mode; set via sysctl handler. */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* Debug counters below are read-only statistics exported via sysctl. */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

/* Forced RX start offset; validated by its sysctl handler. */
static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

/* Adjusted RX start offset; validated by its sysctl handler. */
static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* Global enables for TX/RX hardware checksum offload (default on). */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* TX chain-length histogram, exported via handler (arg2 = 9 buckets). */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* Aggregate networking API usage statistics, exported as a struct. */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* Non-zero enables wake-packet debug logging. */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
848 
/*
 * Track attachment/detachment of an interface filter that affects TSO.
 * Adjusts ifp->if_flt_no_tso_count (must be non-zero before a decrement)
 * and bumps the route generation ID so TCP reevaluates TSO eligibility.
 */
void
ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
{
	/*
	 * update filter count and route_generation ID to let TCP
	 * know it should reevaluate doing TSO or not
	 */
	if (filter_enable) {
		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
	} else {
		VERIFY(ifp->if_flt_no_tso_count != 0);
		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
	}
	routegenid_update();
}
864 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* Snapshots of individual if_attach_nx bits taken at initialization. */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;

#if (DEVELOPMENT || DEBUG)
885 static int
886 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
887 {
888 #pragma unused(oidp, arg1, arg2)
889 	unsigned int new_value;
890 	int changed;
891 	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
892 	    &new_value, &changed);
893 	if (error) {
894 		return error;
895 	}
896 	if (changed) {
897 		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
898 		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
899 			return ENOTSUP;
900 		}
901 		if_attach_nx = new_value;
902 	}
903 	return 0;
904 }
905 
/* DEVELOPMENT/DEBUG-only sysctl exposing the if_attach_nx flags. */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
911 
912 static int
913 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
914 {
915 #pragma unused(oidp, arg1, arg2)
916 	unsigned int new_value;
917 	int changed;
918 	int error;
919 
920 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
921 	    sizeof(if_enable_fsw_transport_netagent),
922 	    &new_value, &changed);
923 	if (error == 0 && changed != 0) {
924 		if (new_value != 0 && new_value != 1) {
925 			/* only allow 0 or 1 */
926 			error = EINVAL;
927 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
928 			/* netagent can be enabled/disabled */
929 			if_enable_fsw_transport_netagent = new_value;
930 			if (new_value == 0) {
931 				kern_nexus_deregister_netagents();
932 			} else {
933 				kern_nexus_register_netagents();
934 			}
935 		} else {
936 			/* netagent can't be enabled */
937 			error = ENOTSUP;
938 		}
939 	}
940 	return error;
941 }
942 
/* Runtime toggle for the flowswitch transport netagent. */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
951 
952 boolean_t
ifnet_nx_noauto(ifnet_t ifp)953 ifnet_nx_noauto(ifnet_t ifp)
954 {
955 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
956 }
957 
/*
 * TRUE when auto-attach of a flowswitch nexus should be skipped;
 * currently this is the case only for low-latency interfaces.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
963 
964 boolean_t
ifnet_is_low_latency(ifnet_t ifp)965 ifnet_is_low_latency(ifnet_t ifp)
966 {
967 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
968 }
969 
/*
 * Return whether the netif compat layer should be plumbed for ifp.
 * Gated globally by the IF_ATTACH_NX_NETIF_COMPAT bit; on non-macOS
 * targets, Wi-Fi interfaces named "ap" are further gated by if_netif_all.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
994 
995 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)996 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
997 {
998 	if (if_is_fsw_transport_netagent_enabled()) {
999 		/* check if netagent has been manually enabled for ipsec/utun */
1000 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
1001 			return ipsec_interface_needs_netagent(ifp);
1002 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
1003 			return utun_interface_needs_netagent(ifp);
1004 		}
1005 
1006 		/* check ifnet no auto nexus override */
1007 		if (ifnet_nx_noauto(ifp)) {
1008 			return FALSE;
1009 		}
1010 
1011 		/* check global if_attach_nx configuration */
1012 		switch (ifp->if_family) {
1013 		case IFNET_FAMILY_CELLULAR:
1014 		case IFNET_FAMILY_ETHERNET:
1015 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
1016 				return TRUE;
1017 			}
1018 			break;
1019 		default:
1020 			break;
1021 		}
1022 	}
1023 	return FALSE;
1024 }
1025 
1026 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1027 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1028 {
1029 #pragma unused(ifp)
1030 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1031 		return TRUE;
1032 	}
1033 	return FALSE;
1034 }
1035 
1036 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1037 ifnet_needs_netif_netagent(ifnet_t ifp)
1038 {
1039 #pragma unused(ifp)
1040 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1041 }
1042 
1043 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1044 dlil_detach_nexus_instance(nexus_controller_t controller,
1045     const char *func_str, uuid_t instance, uuid_t device)
1046 {
1047 	errno_t         err;
1048 
1049 	if (instance == NULL || uuid_is_null(instance)) {
1050 		return FALSE;
1051 	}
1052 
1053 	/* followed by the device port */
1054 	if (device != NULL && !uuid_is_null(device)) {
1055 		err = kern_nexus_ifdetach(controller, instance, device);
1056 		if (err != 0) {
1057 			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1058 			    func_str, err);
1059 		}
1060 	}
1061 	err = kern_nexus_controller_free_provider_instance(controller,
1062 	    instance);
1063 	if (err != 0) {
1064 		DLIL_PRINTF("%s free_provider_instance failed %d\n",
1065 		    func_str, err);
1066 	}
1067 	return TRUE;
1068 }
1069 
1070 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1071 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1072     uuid_t device)
1073 {
1074 	boolean_t               detached = FALSE;
1075 	nexus_controller_t      controller = kern_nexus_shared_controller();
1076 	int                     err;
1077 
1078 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1079 	    device)) {
1080 		detached = TRUE;
1081 	}
1082 	if (provider != NULL && !uuid_is_null(provider)) {
1083 		detached = TRUE;
1084 		err = kern_nexus_controller_deregister_provider(controller,
1085 		    provider);
1086 		if (err != 0) {
1087 			DLIL_PRINTF("%s deregister_provider %d\n",
1088 			    func_str, err);
1089 		}
1090 	}
1091 	return detached;
1092 }
1093 
1094 static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,nexus_type_t type,ifnet_t ifp,uuid_t * provider,uuid_t * instance,nexus_attr_t attr)1095 dlil_create_provider_and_instance(nexus_controller_t controller,
1096     nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
1097     nexus_attr_t attr)
1098 {
1099 	uuid_t          dom_prov;
1100 	errno_t         err;
1101 	nexus_name_t    provider_name;
1102 	const char      *type_name =
1103 	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
1104 	struct kern_nexus_init init;
1105 
1106 	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
1107 	if (err != 0) {
1108 		DLIL_PRINTF("%s can't get %s provider, error %d\n",
1109 		    __func__, type_name, err);
1110 		goto failed;
1111 	}
1112 
1113 	snprintf((char *)provider_name, sizeof(provider_name),
1114 	    "com.apple.%s.%s", type_name, if_name(ifp));
1115 	err = kern_nexus_controller_register_provider(controller,
1116 	    dom_prov,
1117 	    provider_name,
1118 	    NULL,
1119 	    0,
1120 	    attr,
1121 	    provider);
1122 	if (err != 0) {
1123 		DLIL_PRINTF("%s register %s provider failed, error %d\n",
1124 		    __func__, type_name, err);
1125 		goto failed;
1126 	}
1127 	bzero(&init, sizeof(init));
1128 	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
1129 	err = kern_nexus_controller_alloc_provider_instance(controller,
1130 	    *provider,
1131 	    NULL, NULL,
1132 	    instance, &init);
1133 	if (err != 0) {
1134 		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
1135 		    __func__, type_name, err);
1136 		kern_nexus_controller_deregister_provider(controller,
1137 		    *provider);
1138 		goto failed;
1139 	}
1140 failed:
1141 	return err;
1142 }
1143 
/*
 * Create and attach a netif nexus (provider + instance) to ifp, filling
 * in netif_nx with the resulting UUIDs.  Returns TRUE on success; on any
 * failure the partially-created state is torn down and FALSE is returned.
 *
 * NOTE(review): attr is destroyed only on the failure path; on success
 * the function returns without kern_nexus_attr_destroy().  Confirm that
 * kern_nexus_controller_register_provider() copies/consumes attr.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	/* IFCAP_SKYWALK is set once a netif nexus is plumbed */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1197 
1198 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1199 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1200 {
1201 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1202 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1203 		goto failed;
1204 	}
1205 	switch (ifp->if_type) {
1206 	case IFT_CELLULAR:
1207 	case IFT_ETHER:
1208 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1209 			/* don't auto-attach */
1210 			goto failed;
1211 		}
1212 		break;
1213 	default:
1214 		/* don't auto-attach */
1215 		goto failed;
1216 	}
1217 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1218 
1219 failed:
1220 	return FALSE;
1221 }
1222 
1223 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1224 dlil_is_native_netif_nexus(ifnet_t ifp)
1225 {
1226 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1227 }
1228 
/* Tear down the netif nexus described by nexus_netif (provider,
 * instance and ifattach handle). */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1236 
1237 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1238 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1239 {
1240 	struct ifreq        ifr;
1241 	int                 error;
1242 
1243 	bzero(&ifr, sizeof(ifr));
1244 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1245 	if (error == 0) {
1246 		*ifdm_p = ifr.ifr_devmtu;
1247 	}
1248 	return error;
1249 }
1250 
/*
 * On macOS, for Skywalk-native drivers, grow *large_buf_size to cover
 * the driver's advertised TSO MTU (or the GSO MTU when TSO is absent),
 * capped at NX_FSW_MAX_LARGE_BUFSIZE.  No-op elsewhere.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1286 
1287 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1288 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1289     bool *use_multi_buflet, uint32_t *large_buf_size)
1290 {
1291 	struct kern_pbufpool_memory_info rx_pp_info;
1292 	struct kern_pbufpool_memory_info tx_pp_info;
1293 	uint32_t if_max_mtu = 0;
1294 	uint32_t drv_buf_size;
1295 	struct ifdevmtu ifdm;
1296 	int err;
1297 
1298 	/*
1299 	 * To perform intra-stack RX aggregation flowswitch needs to use
1300 	 * multi-buflet packet.
1301 	 */
1302 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1303 
1304 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1305 	/*
1306 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1307 	 * but the driver advertises the MAX MTU as only 9K.
1308 	 */
1309 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1310 		if_max_mtu = IP_MAXPACKET;
1311 		goto skip_mtu_ioctl;
1312 	}
1313 
1314 	/* determine max mtu */
1315 	bzero(&ifdm, sizeof(ifdm));
1316 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1317 	if (__improbable(err != 0)) {
1318 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1319 		    __func__, if_name(ifp));
1320 		/* use default flowswitch buffer size */
1321 		if_max_mtu = NX_FSW_BUFSIZE;
1322 	} else {
1323 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1324 		    ifdm.ifdm_max, ifdm.ifdm_current);
1325 		/* rdar://problem/44589731 */
1326 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1327 	}
1328 
1329 skip_mtu_ioctl:
1330 	if (if_max_mtu == 0) {
1331 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1332 		    __func__, if_name(ifp));
1333 		return EINVAL;
1334 	}
1335 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1336 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1337 		    "max bufsize(%d)\n", __func__,
1338 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1339 		return EINVAL;
1340 	}
1341 
1342 	/*
1343 	 * for skywalk native driver, consult the driver packet pool also.
1344 	 */
1345 	if (dlil_is_native_netif_nexus(ifp)) {
1346 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1347 		    &tx_pp_info);
1348 		if (err != 0) {
1349 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1350 			    __func__, if_name(ifp));
1351 			return ENXIO;
1352 		}
1353 		drv_buf_size = tx_pp_info.kpm_bufsize *
1354 		    tx_pp_info.kpm_max_frags;
1355 		if (if_max_mtu > drv_buf_size) {
1356 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1357 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1358 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1359 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1360 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1361 			return EINVAL;
1362 		}
1363 	} else {
1364 		drv_buf_size = if_max_mtu;
1365 	}
1366 
1367 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1368 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1369 		*use_multi_buflet = true;
1370 		/* default flowswitch buffer size */
1371 		*buf_size = NX_FSW_BUFSIZE;
1372 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1373 	} else {
1374 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1375 	}
1376 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1377 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1378 	if (*buf_size >= *large_buf_size) {
1379 		*large_buf_size = 0;
1380 	}
1381 	return 0;
1382 }
1383 
/*
 * Create and attach a flowswitch nexus on top of ifp's netif, filling
 * nexus_fsw with the resulting provider/instance/device UUIDs.  The
 * interface must already have a netif plumbed (IFCAP_SKYWALK) and be
 * eligible for auto-attach.  Returns TRUE on success; on failure all
 * partially-created state is torn down and FALSE is returned.
 *
 * NOTE(review): like the netif path, attr is destroyed only on the
 * failure path; confirm attr ownership on the success return.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	/* interfaces that must not get an auto-attached flowswitch */
	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* size the flowswitch buffers from the MTU / driver pool geometry */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means we bailed out by policy, not by error */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1482 
/*
 * Attach a flowswitch nexus to ifp if eligible (TXSTART model, not
 * already attached).  The attach is performed into a local snapshot and
 * only published to ifp->if_nx_flowswitch under the ifnet lock while
 * the interface is still fully attached; if the interface started
 * detaching in the meantime, the new nexus is torn down again.
 * Returns TRUE only when the nexus ends up published on ifp.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1528 
/* Tear down the flowswitch nexus described by nexus_fsw (provider,
 * instance and device attachment). */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1536 
1537 __attribute__((noinline))
1538 static void
dlil_netif_detach_notify(ifnet_t ifp)1539 dlil_netif_detach_notify(ifnet_t ifp)
1540 {
1541 	ifnet_detach_notify_cb_t notify = NULL;
1542 	void *arg = NULL;
1543 
1544 	ifnet_get_detach_notify(ifp, &notify, &arg);
1545 	if (notify == NULL) {
1546 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1547 		return;
1548 	}
1549 	(*notify)(arg);
1550 }
1551 
/*
 * Quiesce data movement on ifp, then detach both nexuses in order:
 * flowswitch first, then netif.  The corresponding state in the ifnet
 * is zeroed after each detach.  The ASSERTs enforce the invariant that
 * provider/instance/attachment UUIDs are set and cleared together.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block new data movement and drain in-flight activity */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1583 
1584 boolean_t
ifnet_add_netagent(ifnet_t ifp)1585 ifnet_add_netagent(ifnet_t ifp)
1586 {
1587 	int     error;
1588 
1589 	error = kern_nexus_interface_add_netagent(ifp);
1590 	os_log(OS_LOG_DEFAULT,
1591 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1592 	    ifp->if_xname, error);
1593 	return error == 0;
1594 }
1595 
1596 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1597 ifnet_remove_netagent(ifnet_t ifp)
1598 {
1599 	int     error;
1600 
1601 	error = kern_nexus_interface_remove_netagent(ifp);
1602 	os_log(OS_LOG_DEFAULT,
1603 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1604 	    ifp->if_xname, error);
1605 	return error == 0;
1606 }
1607 
1608 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1609 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1610 {
1611 	if (!IF_FULLY_ATTACHED(ifp)) {
1612 		return FALSE;
1613 	}
1614 	return dlil_attach_flowswitch_nexus(ifp);
1615 }
1616 
1617 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1618 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1619 {
1620 	if_nexus_flowswitch     nexus_fsw;
1621 
1622 	ifnet_lock_exclusive(ifp);
1623 	nexus_fsw = ifp->if_nx_flowswitch;
1624 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1625 	ifnet_lock_done(ifp);
1626 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1627 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1628 }
1629 
1630 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1631 ifnet_attach_netif_nexus(ifnet_t ifp)
1632 {
1633 	boolean_t       nexus_attached;
1634 	if_nexus_netif  nexus_netif;
1635 
1636 	if (!IF_FULLY_ATTACHED(ifp)) {
1637 		return FALSE;
1638 	}
1639 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1640 	if (nexus_attached) {
1641 		ifnet_lock_exclusive(ifp);
1642 		ifp->if_nx_netif = nexus_netif;
1643 		ifnet_lock_done(ifp);
1644 	}
1645 	return nexus_attached;
1646 }
1647 
/*
 * Detach the netif nexus from the interface.
 *
 * As with the flowswitch variant, the nexus state is snapshotted and
 * cleared under the exclusive ifnet lock, then detached outside the
 * lock.  Returns the detach result.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif  nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1661 
1662 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1663 ifnet_attach_native_flowswitch(ifnet_t ifp)
1664 {
1665 	if (!dlil_is_native_netif_nexus(ifp)) {
1666 		/* not a native netif */
1667 		return;
1668 	}
1669 	ifnet_attach_flowswitch_nexus(ifp);
1670 }
1671 
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback.
 *
 * Blocks until every outstanding user of the current callback drops
 * its reference (if_fsw_rx_cb_ref drains to 0) before replacing the
 * callback/argument pair.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		/* sleep until ifnet_release_flowswitch_rx_callback() wakes us */
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1687 
/*
 * Look up the flowswitch RX callback and take a reference on it.
 *
 * Returns ENOENT if no callback is installed; otherwise stores the
 * callback/argument pair in *cbp / *argp and bumps if_fsw_rx_cb_ref.
 * The caller must drop the reference with
 * ifnet_release_flowswitch_rx_callback().
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1709 
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback(),
 * waking any setter that is waiting for the count to drain to zero.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1719 
/*
 * Set (or clear, with parent == NULL) the delegate parent interface.
 *
 * Blocks until all outstanding references on the current parent are
 * released (if_delegate_parent_ref drains to 0) before replacing it.
 * Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		/* sleep until ifnet_release_delegate_parent() wakes us */
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1734 
/*
 * Get the delegate parent interface and take a reference on it.
 *
 * Returns ENOENT if no parent is set; otherwise stores it in *parentp
 * and bumps if_delegate_parent_ref.  The caller must drop the
 * reference with ifnet_release_delegate_parent().
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1748 
/*
 * Drop a reference taken by ifnet_get_delegate_parent(), waking any
 * setter that is waiting for the count to drain to zero.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1758 
/*
 * Record the detach-notification callback and argument for the
 * interface.  Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1767 
/*
 * Read back the detach-notification callback and argument.  Caller
 * must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1776 
/*
 * Lock-taking wrapper: set the detach-notification callback while
 * holding the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1785 
/*
 * Lock-taking wrapper: read the detach-notification callback while
 * holding the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1794 #endif /* SKYWALK */
1795 
/*
 * Sanity-check an inbound mbuf: it must carry a packet header and its
 * recorded receive interface must match `ifp' (loopback excepted);
 * panic otherwise.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}
1804 
/*
 * Exponentially-weighted moving average:
 *   avg = (avg * (2^decay - 1) + new) / 2^decay
 * computed with shifts.  A zero stored average is seeded with the new
 * sample directly.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1813 
#define MBPS    (1ULL * 1000 * 1000)    /* bits per second, mega */
#define GBPS    (MBPS * 1000)           /* bits per second, giga */

/*
 * Per-link-speed watermarks used by opportunistic input polling to
 * decide when to switch between interrupt and polling mode.
 */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* ordered by increasing speed; the all-zero entry terminates the table */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1833 
/* serializes access to dlil_pending_thread_cnt */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* number of DLIL threads still pending startup */
static uint32_t dlil_pending_thread_cnt = 0;
1837 
/*
 * Note that one more DLIL thread is pending startup.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1846 
/*
 * Note that a pending DLIL thread has finished starting up; wakes any
 * waiter once the count drains to zero.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1859 
1860 int
proto_hash_value(u_int32_t protocol_family)1861 proto_hash_value(u_int32_t protocol_family)
1862 {
1863 	/*
1864 	 * dlil_proto_unplumb_all() depends on the mapping between
1865 	 * the hash bucket index and the protocol family defined
1866 	 * here; future changes must be applied there as well.
1867 	 */
1868 	switch (protocol_family) {
1869 	case PF_INET:
1870 		return 0;
1871 	case PF_INET6:
1872 		return 1;
1873 	case PF_VLAN:
1874 		return 2;
1875 	case PF_UNSPEC:
1876 	default:
1877 		return 3;
1878 	}
1879 }
1880 
1881 /*
1882  * Caller must already be holding ifnet lock.
1883  */
static struct if_proto *
find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
{
	struct if_proto *proto = NULL;
	u_int32_t i = proto_hash_value(protocol_family);

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);

	/* start at the head of this family's hash bucket */
	if (ifp->if_proto_hash != NULL) {
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
	}

	/* linear scan of the bucket for an exact family match */
	while (proto != NULL && proto->protocol_family != protocol_family) {
		proto = SLIST_NEXT(proto, next_hash);
	}

	/* return a referenced entry; caller drops it via if_proto_free() */
	if (proto != NULL) {
		if_proto_ref(proto);
	}

	return proto;
}
1906 
/*
 * Take a reference on an attached protocol (relaxed atomic increment).
 */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1912 
1913 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1914 
/*
 * Drop a reference on an attached protocol.  When the last reference
 * is released: invoke the protocol's detached() callback (KPI v1 or
 * v2), purge its routes, post KEV_DL_PROTO_DETACHED, mark the
 * interface down if no protocols remain, and free the if_proto.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	/* last reference: notify the protocol via its KPI callback */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1976 
1977 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1978 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1979 {
1980 #if !MACH_ASSERT
1981 #pragma unused(ifp)
1982 #endif
1983 	unsigned int type = 0;
1984 	int ass = 1;
1985 
1986 	switch (what) {
1987 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1988 		type = LCK_RW_ASSERT_EXCLUSIVE;
1989 		break;
1990 
1991 	case IFNET_LCK_ASSERT_SHARED:
1992 		type = LCK_RW_ASSERT_SHARED;
1993 		break;
1994 
1995 	case IFNET_LCK_ASSERT_OWNED:
1996 		type = LCK_RW_ASSERT_HELD;
1997 		break;
1998 
1999 	case IFNET_LCK_ASSERT_NOTOWNED:
2000 		/* nothing to do here for RW lock; bypass assert */
2001 		ass = 0;
2002 		break;
2003 
2004 	default:
2005 		panic("bad ifnet assert type: %d", what);
2006 		/* NOTREACHED */
2007 	}
2008 	if (ass) {
2009 		LCK_RW_ASSERT(&ifp->if_lock, type);
2010 	}
2011 }
2012 
/* Acquire the ifnet RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the ifnet RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the ifnet RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2030 
#if INET
/* Acquire the per-ifnet IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-ifnet IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-ifnet IPv4 data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
2050 
/* Acquire the per-ifnet IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-ifnet IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-ifnet IPv6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2068 
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2092 
2093 /*
2094  * dlil_ifp_protolist
2095  * - get the list of protocols attached to the interface, or just the number
2096  *   of attached protocols
2097  * - if the number returned is greater than 'list_count', truncation occurred
2098  *
2099  * Note:
2100  * - caller must already be holding ifnet lock.
2101  */
2102 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)2103 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2104     u_int32_t list_count)
2105 {
2106 	u_int32_t       count = 0;
2107 	int             i;
2108 
2109 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
2110 
2111 	if (ifp->if_proto_hash == NULL) {
2112 		goto done;
2113 	}
2114 
2115 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2116 		struct if_proto *proto;
2117 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2118 			if (list != NULL && count < list_count) {
2119 				list[count] = proto->protocol_family;
2120 			}
2121 			count++;
2122 		}
2123 	}
2124 done:
2125 	return count;
2126 }
2127 
/*
 * Snapshot the families of the protocols attached to `ifp' under the
 * shared ifnet lock.  Returns the total attached count, which may
 * exceed `count' if the supplied buffer was too small.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
2136 
/*
 * Release a protocol-list buffer used with if_get_protolist().
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2142 
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for an interface.
 *
 * If `event_data' is NULL a bare net_event_data is synthesized; the
 * interface name/family/unit are always filled into the payload.  The
 * interface generation count is updated unless the event is one of
 * the frequent link-quality/state codes or `suppress_generation' is
 * set.  Returns the result of dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2204 
/*
 * Allocate the per-interface TCP/UDP stats and ECN stats structures.
 *
 * The tcp/udp stat objects come from zones sized with extra room so
 * the usable pointer can be aligned to 8 bytes; the original zone
 * pointer is stashed one pointer-size below the aligned base so it
 * can be recovered at free time.
 *
 * Returns 0 on success, EINVAL on a NULL ifp.
 *
 * NOTE(review): if if_tcp_stat/if_udp_stat are already set on entry,
 * `ret' stays EINVAL and the cleanup path below frees the existing
 * stats — this appears to assume the function runs at most once per
 * ifp lifetime; confirm with callers.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* on failure, unwind any allocations recorded on the ifp */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2290 
/*
 * Reset all opportunistic-polling state for an interface: cancel the
 * poll cycle, force the mode back to INPUT_POLL_OFF, and zero the
 * request counters, statistics, and mode/sample hold timestamps.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2309 
/*
 * Create (or configure) the input thread for `ifp'.
 *
 * The thread continuation and input strategy are chosen from the
 * interface's capabilities:
 *  - ifp == NULL: the main input thread (dlil_init time);
 *  - legacy RXPOLL interfaces: hybrid polling thread;
 *  - net_async or legacy interfaces: per-interface async thread;
 *  - otherwise: synchronous strategy with no dedicated thread, in
 *    which case ENODEV is returned after queue setup.
 *
 * Also initializes the threading info's lock and receive queue
 * limits, sets the thread's scheduling precedence, and optionally
 * places it in an affinity set.  Panics if a required thread cannot
 * be started; the chosen continuation is reported via *thfunc.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no thread to start */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2451 
#if TEST_INPUT_THREAD_TERMINATION
/*
 * Debug-only sysctl handler: sets the busy-spin count used to delay
 * input thread termination.  Updates are rejected with ENXIO when rx
 * polling is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2475 
/*
 * Tear down a per-interface threading info structure after its input
 * thread has terminated: destroy the lock and its group, zero names,
 * flags, queue limit and statistics, and verify that no residual
 * thread/affinity state remains.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2501 
/*
 * Terminate the calling per-interface input thread (never the main
 * one): drain any queued packets, signal TERMINATE_COMPLETE to the
 * thread waiting on dlth_flags, free the drained packets, drop the
 * extra reference from kernel_thread_start(), and terminate.  Does
 * not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2549 
2550 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2551 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2552 {
2553 	thread_affinity_policy_data_t policy;
2554 
2555 	bzero(&policy, sizeof(policy));
2556 	policy.affinity_tag = tag;
2557 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2558 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2559 }
2560 
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Net-filter eventhandler: keep the flowswitch transport netagent
 * enabled only while no filter subsystems (other than the private PF
 * proxy) are active, propagating any change to the netagents or the
 * NECP clients.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool was_enabled = if_enable_fsw_transport_netagent;
	bool no_filters = ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0);

	if_enable_fsw_transport_netagent = no_filters ? 1 : 0;
	if (was_enabled != (bool)if_enable_fsw_transport_netagent) {
		/* enablement flipped: republish the netagents */
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		/* still disabled: make sure NECP clients observe the state */
		necp_update_all_clients();
	}
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
2579 
/*
 * One-time initialization of the DLIL (Data Link Interface Layer):
 * compile-time invariant checks, boot-arg parsing, allocation zone
 * creation, subsystem initialization, and creation of the main DLIL
 * input thread and the ifnet detacher thread.  Blocks until the
 * created threads have been scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* Pick up boot-arg overrides for the tunables below. */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* Debug builds use the larger dlil_ifnet_dbg layout. */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	/* Global interface lists start out empty. */
	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	/* drop the extra reference returned by kernel_thread_start() */
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2894 
2895 static void
if_flt_monitor_busy(struct ifnet * ifp)2896 if_flt_monitor_busy(struct ifnet *ifp)
2897 {
2898 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2899 
2900 	++ifp->if_flt_busy;
2901 	VERIFY(ifp->if_flt_busy != 0);
2902 }
2903 
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	/*
	 * Drop a busy reference on ifp's filter list; delegates to
	 * if_flt_monitor_leave(), which also wakes any waiters.
	 */
	if_flt_monitor_leave(ifp);
}
2909 
2910 static void
if_flt_monitor_enter(struct ifnet * ifp)2911 if_flt_monitor_enter(struct ifnet *ifp)
2912 {
2913 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2914 
2915 	while (ifp->if_flt_busy) {
2916 		++ifp->if_flt_waiters;
2917 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2918 		    (PZERO - 1), "if_flt_monitor", NULL);
2919 	}
2920 	if_flt_monitor_busy(ifp);
2921 }
2922 
2923 static void
if_flt_monitor_leave(struct ifnet * ifp)2924 if_flt_monitor_leave(struct ifnet *ifp)
2925 {
2926 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2927 
2928 	VERIFY(ifp->if_flt_busy != 0);
2929 	--ifp->if_flt_busy;
2930 
2931 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2932 		ifp->if_flt_waiters = 0;
2933 		wakeup(&ifp->if_flt_head);
2934 	}
2935 }
2936 
/*
 * Attach the interface filter described by "if_filter" to "ifp".
 * On success, *filter_ref receives the newly allocated filter so the
 * caller can detach it later.  Returns 0, or ENXIO if the interface
 * is not in the global list or is no longer attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an I/O refcnt on success; released before "done" below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* serialize against other attach/detach via the filter monitor */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* third-party (non-OS) filter: track per interface */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3027 
/*
 * Detach "filter" and destroy it.  When "detached" is zero, the filter
 * is searched for on every attached interface and unlinked under the
 * filter monitor; when non-zero, the caller (ifnet_detach_final) has
 * already emptied if_flt_head and only the bookkeeping and destruction
 * are performed here.  Returns 0, or EINVAL if the filter reference
 * was not found on any interface.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	/* clear so the error print below can't touch freed memory */
	filter = NULL;
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3148 
3149 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3150 dlil_detach_filter(interface_filter_t filter)
3151 {
3152 	if (filter == NULL) {
3153 		return;
3154 	}
3155 	dlil_detach_filter_internal(filter, 0);
3156 }
3157 
3158 __private_extern__ boolean_t
dlil_has_ip_filter(void)3159 dlil_has_ip_filter(void)
3160 {
3161 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3162 
3163 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3164 
3165 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3166 	return has_filter;
3167 }
3168 
3169 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3170 dlil_has_if_filter(struct ifnet *ifp)
3171 {
3172 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3173 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3174 	return has_filter;
3175 }
3176 
3177 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)3178 dlil_input_wakeup(struct dlil_threading_info *inp)
3179 {
3180 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3181 
3182 	inp->dlth_flags |= DLIL_INPUT_WAITING;
3183 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3184 		inp->dlth_wtot++;
3185 		wakeup_one((caddr_t)&inp->dlth_flags);
3186 	}
3187 }
3188 
/*
 * Bootstrap routine for the main DLIL input thread: performs sanity
 * checks, marks itself embryonic, self-wakes once so the continuation
 * runs (clearing the embryonic state and notifying dlil_init()), then
 * blocks with dlil_main_input_thread_cont as its continuation.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	/* must be the one and only main input thread, with no bound ifnet */
	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3211 
3212 /*
3213  * Main input thread:
3214  *
3215  *   a) handles all inbound packets for lo0
3216  *   b) handles all inbound packets for interfaces with no dedicated
3217  *	input thread (e.g. anything but Ethernet/PDP or those that support
3218  *	opportunistic polling.)
3219  *   c) protocol registrations
3220  *   d) packet injections
3221  */
/*
 * Continuation body of the main DLIL input thread: drains the shared
 * and lo0-only receive queues, services protocol registration/injection
 * requests, and re-blocks with itself as the continuation when idle.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass only: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		lck_mtx_unlock(&inp->dlth_lock);

		/* tell dlil_init() this thread has been scheduled once */
		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* only RUNNING left set: no new work arrived, go idle */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3308 
3309 /*
3310  * Input thread for interfaces with legacy input model.
3311  */
/*
 * Bootstrap routine for a per-interface (legacy model) input thread:
 * validates the interface configuration, names the thread after the
 * interface, marks itself embryonic, self-wakes once so the
 * continuation runs, then blocks with dlil_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* dedicated thread: must be bound to a non-polling legacy ifnet */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3346 
/*
 * Continuation body of a per-interface (legacy model) input thread:
 * drains the interface's receive queue, syncs input stats, and
 * re-blocks with itself as the continuation when idle.  Terminates
 * (never returns) when interrupted or when DLIL_INPUT_TERMINATE is set.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass only: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		/* tell the attach path this thread has been scheduled once */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no work beyond RUNNING/TERMINATE pending: leave the loop */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3450 
3451 /*
3452  * Input thread for interfaces with opportunistic polling input model.
3453  */
/*
 * Bootstrap routine for an opportunistic-polling (RXPOLL) input thread:
 * validates the interface supports polling, names the thread after the
 * interface, marks itself embryonic, self-wakes once so the
 * continuation runs, then blocks with dlil_rxpoll_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* dedicated thread: must be bound to an RXPOLL-capable legacy ifnet */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3485 
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	/*
	 * Continuation body of the opportunistic-polling input thread.
	 * Each wakeup: drain the input queue, sample traffic statistics,
	 * possibly switch the driver between interrupt and polling mode,
	 * deliver the packet chain up the stack, then block again with
	 * this function as the continuation.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	/* interrupted wait or a pending terminate request ends the thread */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		/* dlth_lock is held at the top of every iteration */
		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/*
		 * First wakeup after thread creation: clear the embryonic
		 * flag and skip the sampling/mode logic for this pass.
		 */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		/*
		 * Traffic sampling: accumulate packet/byte counters and,
		 * once per sample hold period, fold them into the EWMA
		 * averages that drive the poll on/off decision below.
		 */
		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* rate-limit transitions with the mode hold time */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Turn polling off when both averages drop below the
			 * low watermarks; turn it on when the packet average
			 * and either the byte or the wakeup average reach
			 * their high watermarks.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass only: report the thread as fully constructed */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* tell the driver to switch input models */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			/* update mode-transition counters either way */
			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		/* loop while any flag beyond RUNNING/TERMINATE is pending */
		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait and block on this same continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3771 
3772 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3773 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3774 {
3775 	if (p != NULL) {
3776 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3777 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3778 			return EINVAL;
3779 		}
3780 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3781 		    p->packets_lowat >= p->packets_hiwat) {
3782 			return EINVAL;
3783 		}
3784 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3785 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3786 			return EINVAL;
3787 		}
3788 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3789 		    p->bytes_lowat >= p->bytes_hiwat) {
3790 			return EINVAL;
3791 		}
3792 		if (p->interval_time != 0 &&
3793 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3794 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3795 		}
3796 	}
3797 	return 0;
3798 }
3799 
3800 void
dlil_rxpoll_update_params(struct ifnet * ifp,struct ifnet_poll_params * p)3801 dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3802 {
3803 	u_int64_t sample_holdtime, inbw;
3804 
3805 	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
3806 		sample_holdtime = 0;    /* polling is disabled */
3807 		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
3808 		    ifp->if_rxpoll_blowat = 0;
3809 		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
3810 		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
3811 		ifp->if_rxpoll_plim = 0;
3812 		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
3813 	} else {
3814 		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
3815 		u_int64_t ival;
3816 		unsigned int n, i;
3817 
3818 		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
3819 			if (inbw < rxpoll_tbl[i].speed) {
3820 				break;
3821 			}
3822 			n = i;
3823 		}
3824 		/* auto-tune if caller didn't specify a value */
3825 		plowat = ((p == NULL || p->packets_lowat == 0) ?
3826 		    rxpoll_tbl[n].plowat : p->packets_lowat);
3827 		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
3828 		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
3829 		blowat = ((p == NULL || p->bytes_lowat == 0) ?
3830 		    rxpoll_tbl[n].blowat : p->bytes_lowat);
3831 		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
3832 		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
3833 		plim = ((p == NULL || p->packets_limit == 0 ||
3834 		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
3835 		ival = ((p == NULL || p->interval_time == 0 ||
3836 		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
3837 		    if_rxpoll_interval_time : p->interval_time);
3838 
3839 		VERIFY(plowat != 0 && phiwat != 0);
3840 		VERIFY(blowat != 0 && bhiwat != 0);
3841 		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
3842 
3843 		sample_holdtime = if_rxpoll_sample_holdtime;
3844 		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
3845 		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
3846 		ifp->if_rxpoll_plowat = plowat;
3847 		ifp->if_rxpoll_phiwat = phiwat;
3848 		ifp->if_rxpoll_blowat = blowat;
3849 		ifp->if_rxpoll_bhiwat = bhiwat;
3850 		ifp->if_rxpoll_plim = plim;
3851 		ifp->if_rxpoll_ival = ival;
3852 	}
3853 
3854 	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
3855 	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);
3856 
3857 	if (dlil_verbose) {
3858 		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
3859 		    "poll interval %llu nsec, pkts per poll %u, "
3860 		    "pkt limits [%u/%u], wreq limits [%u/%u], "
3861 		    "bytes limits [%u/%u]\n", if_name(ifp),
3862 		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
3863 		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
3864 		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
3865 		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
3866 		    ifp->if_rxpoll_bhiwat);
3867 	}
3868 }
3869 
3870 /*
3871  * Must be called on an attached ifnet (caller is expected to check.)
3872  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3873  */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	/* interface must support legacy rx-poll and have an input thread */
	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	/* `locked' means the caller already holds dlth_lock */
	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3911 
3912 /*
3913  * Must be called on an attached ifnet (caller is expected to check.)
3914  */
3915 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3916 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3917 {
3918 	struct dlil_threading_info *inp;
3919 
3920 	VERIFY(ifp != NULL && p != NULL);
3921 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3922 		return ENXIO;
3923 	}
3924 
3925 	bzero(p, sizeof(*p));
3926 
3927 	lck_mtx_lock(&inp->dlth_lock);
3928 	p->packets_limit = ifp->if_rxpoll_plim;
3929 	p->packets_lowat = ifp->if_rxpoll_plowat;
3930 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3931 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3932 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3933 	p->interval_time = ifp->if_rxpoll_ival;
3934 	lck_mtx_unlock(&inp->dlth_lock);
3935 
3936 	return 0;
3937 }
3938 
/*
 * Basic input KPI: submit an mbuf chain with optional stats; the
 * chain is walked internally (no tail pointer, non-extended, no poll).
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3945 
/*
 * Extended input KPI: driver supplies the chain tail and mandatory
 * stats (packets_in is asserted against the chain in the common path).
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3952 
3953 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3954 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3955     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3956 {
3957 	return ifnet_input_common(ifp, m_head, m_tail, s,
3958 	           (m_head != NULL), TRUE);
3959 }
3960 
/*
 * Common entry for all ifnet_input* variants: validate arguments,
 * pin the interface with an IO refcnt, compute/verify chain counts,
 * then hand the chain to the interface's input function.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* empty chain allowed only for poll; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to count and find it */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* sanity mode: re-walk to verify the driver's tail */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	/*
	 * NOTE(review): when the caller supplied `s', the copy into `_s'
	 * and the two assignments below are dead stores -- `s' (not `&_s')
	 * is what gets passed to input_func, so the driver-provided
	 * counts are used as-is.  Only the s == NULL path actually
	 * consumes `_s'.  Presumably intentional (keeps the driver's
	 * byte count); confirm before "fixing".
	 */
	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
4075 
4076 #if SKYWALK
/*
 * Atomically install `fn' as the interface's input handler, but only
 * if the default dlil_input_handler is still in place; returns EBUSY
 * when another handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4084 
/*
 * Swap the input handler back to the default dlil_input_handler.
 * CAS loop re-reads the current handler each attempt until the
 * exchange succeeds.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4094 
/*
 * Atomically install `fn' as the interface's output handler, but only
 * if the default dlil_output_handler is still in place; returns EBUSY
 * when another handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4102 
/*
 * Swap the output handler back to the default dlil_output_handler
 * (CAS loop mirroring dlil_reset_input_handler).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4112 #endif /* SKYWALK */
4113 
/*
 * Default output handler: forward the packet straight to the
 * interface's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4119 
/*
 * Default input handler: route the chain to the interface's input
 * thread strategy (async or sync), falling back to the main input
 * thread when the interface has no dedicated one.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* threads marked for synchronous RX bypass the strategy hook */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4140 
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True only for mbuf queues (QP_MBUF) whose current length exceeds
 * the larger of the if_rcvq_burst_limit sysctl and the queue's own
 * limit -- i.e. the queue is past both caps.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)

#define MAX_KNOWN_MBUF_CLASS 8
4149 
4150 static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t * input_queue,dlil_freeq_t * freeq,struct ifnet_stat_increment_param * stat_delta)4151 dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
4152     dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
4153 {
4154 	uint32_t overcommitted_qlen;    /* Length in packets. */
4155 	uint64_t overcommitted_qsize;   /* Size in bytes. */
4156 	uint32_t target_qlen;                   /* The desired queue length after trimming. */
4157 	uint32_t pkts_to_drop;                  /* Number of packets to drop. */
4158 	uint32_t dropped_pkts = 0;              /* Number of packets that were dropped. */
4159 	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
4160 	struct mbuf *m = NULL, *m_tmp = NULL;
4161 
4162 	overcommitted_qlen = qlen(input_queue);
4163 	overcommitted_qsize = qsize(input_queue);
4164 	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;
4165 
4166 	if (overcommitted_qlen <= target_qlen) {
4167 		/*
4168 		 * The queue is already within the target limits.
4169 		 */
4170 		dropped_pkts = 0;
4171 		goto out;
4172 	}
4173 
4174 	pkts_to_drop = overcommitted_qlen - target_qlen;
4175 
4176 	/*
4177 	 * Proceed to removing packets from the head of the queue,
4178 	 * starting from the oldest, until the desired number of packets
4179 	 * has been dropped.
4180 	 */
4181 	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
4182 		if (pkts_to_drop <= dropped_pkts) {
4183 			break;
4184 		}
4185 		MBUFQ_REMOVE(&qmbufq(input_queue), m);
4186 		MBUFQ_NEXT(m) = NULL;
4187 		MBUFQ_ENQUEUE(freeq, m);
4188 
4189 		dropped_pkts += 1;
4190 		dropped_bytes += m_length(m);
4191 	}
4192 
4193 	/*
4194 	 * Adjust the length and the estimated size of the queue
4195 	 * after trimming.
4196 	 */
4197 	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
4198 	qlen(input_queue) = target_qlen;
4199 
4200 	/* qsize() is an approximation. */
4201 	if (dropped_bytes < qsize(input_queue)) {
4202 		qsize(input_queue) -= dropped_bytes;
4203 	} else {
4204 		qsize(input_queue) = 0;
4205 	}
4206 
4207 	/*
4208 	 * Adjust the ifnet statistics increments, if needed.
4209 	 */
4210 	stat_delta->dropped += dropped_pkts;
4211 	if (dropped_pkts < stat_delta->packets_in) {
4212 		stat_delta->packets_in -= dropped_pkts;
4213 	} else {
4214 		stat_delta->packets_in = 0;
4215 	}
4216 	if (dropped_bytes < stat_delta->bytes_in) {
4217 		stat_delta->bytes_in -= dropped_bytes;
4218 	} else {
4219 		stat_delta->bytes_in = 0;
4220 	}
4221 
4222 out:
4223 	if (dlil_verbose) {
4224 		/*
4225 		 * The basic information about the drop is logged
4226 		 * by the invoking function (dlil_input_{,a}sync).
4227 		 * If `dlil_verbose' flag is set, provide more information
4228 		 * that can be useful for debugging.
4229 		 */
4230 		DLIL_PRINTF("%s: "
4231 		    "qlen: %u -> %u, "
4232 		    "qsize: %llu -> %llu "
4233 		    "qlimit: %u (sysctl: %u) "
4234 		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
4235 		    "dropped_pkts: %u dropped_bytes %u\n",
4236 		    __func__,
4237 		    overcommitted_qlen, qlen(input_queue),
4238 		    overcommitted_qsize, qsize(input_queue),
4239 		    qlimit(input_queue), if_rcvq_burst_limit,
4240 		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
4241 		    dropped_pkts, dropped_bytes);
4242 	}
4243 
4244 	return dropped_pkts;
4245 }
4246 
/*
 * Asynchronous input strategy (the default): enqueue the chain on the
 * input thread's receive queue, trim the queue if it is overcommitted,
 * account the stats, and wake the input thread.  Excess packets are
 * freed only after dlth_lock is dropped.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* local copy of the stats; adjusted downward if we trim */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the lock across the affinity call, then retake it */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* trim bursts that exceed both the queue and sysctl limits */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4390 
/*
 * Synchronous input path for a dedicated (non-main) input thread.
 *
 * The chain [m_head .. m_tail] is appended to inp->dlth_pkts under
 * dlth_lock, trimmed if the queue is overcommitted, then the entire
 * queue is drained and processed on the *calling* thread via
 * dlil_input_packet_list_extended() -- no handoff to the input thread.
 *
 * `s' describes the chain (packets_in/bytes_in); a trimmed copy
 * (s_adj) is what actually feeds the stats.  `poll' indicates the
 * chain came from the RX poller.  `tp' is unused here.
 *
 * Returns 0 always (a NULL m_head is tolerated and ignored).
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* local copy of the stats; adjusted downward if packets are trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	/* excess packets dropped by the trim; freed only after unlock */
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* enforce the receive-queue burst limit; account what was dropped */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the caller-supplied stats (m_cnt/m_size) match
	 * the actual chain; panics on mismatch.  Uses the original,
	 * pre-trim figures on purpose.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued (ours and any prior residue) for processing */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4500 
4501 #if SKYWALK
4502 errno_t
ifnet_set_output_handler(struct ifnet * ifp,ifnet_output_func fn)4503 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4504 {
4505 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4506 	           ptrauth_nop_cast(void *, ifp->if_save_output),
4507 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4508 }
4509 
4510 void
ifnet_reset_output_handler(struct ifnet * ifp)4511 ifnet_reset_output_handler(struct ifnet *ifp)
4512 {
4513 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4514 	    ptrauth_nop_cast(void *, ifp->if_output),
4515 	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
4516 		;
4517 	}
4518 }
4519 
4520 errno_t
ifnet_set_start_handler(struct ifnet * ifp,ifnet_start_func fn)4521 ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4522 {
4523 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4524 	           ptrauth_nop_cast(void *, ifp->if_save_start),
4525 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4526 }
4527 
4528 void
ifnet_reset_start_handler(struct ifnet * ifp)4529 ifnet_reset_start_handler(struct ifnet *ifp)
4530 {
4531 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4532 	    ptrauth_nop_cast(void *, ifp->if_start),
4533 	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
4534 		;
4535 	}
4536 }
4537 #endif /* SKYWALK */
4538 
/*
 * Common helper for kicking an interface's starter thread.
 *
 * resetfc:      clear IFSF_FLOW_CONTROLLED before waking the starter
 *               (used to resume a previously flow-controlled interface).
 * ignore_delay: set IFSF_NO_DELAY so the starter bypasses the
 *               delayed-start machinery.
 *
 * A no-op unless the interface uses a starter thread (IFEF_TXSTART).
 * If the interface is flow-controlled and resetfc is FALSE, the
 * request is dropped without bumping if_start_req.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter only when it is idle and, for ENQUEUE_MULTI
	 * interfaces, only once the delayed-start conditions no longer
	 * apply (queue long enough, or no delay pending).
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4571 
/*
 * Record the absolute wakeup time (nanoseconds on the nanouptime()
 * timebase, as compared in ifnet_start_thread_cont) for the starter
 * thread's pacemaker timer; 0 disables it.
 * NOTE(review): stored without if_start_lock held -- presumably a
 * benign race with the starter thread; confirm with callers.
 */
void
ifnet_start_set_pacemaker_time(struct ifnet *ifp, uint64_t tx_time)
{
	ifp->if_start_pacemaker_time = tx_time;
}
4577 
/*
 * Kick the interface's starter thread: no flow-control reset, and
 * any configured start delay is honored.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4583 
/*
 * Kick the interface's starter thread, bypassing the delayed-start
 * machinery (sets IFSF_NO_DELAY via ifnet_start_common).
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4589 
/*
 * Entry point of the per-interface starter thread.
 *
 * Names the thread, optionally binds the lo0 starter to the main input
 * thread's affinity tag, performs the "embryonic" handshake (one
 * self-wakeup so the continuation can clear the embryonic state and
 * drop the pending-thread count), and then blocks into
 * ifnet_start_thread_cont(), which runs the service loop forever.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	/* embryonic handshake: assert-wait, then wake ourselves once */
	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4655 
/*
 * Continuation body of the starter thread.
 *
 * Runs the driver's if_start routine in a loop while new requests
 * (if_start_req) keep arriving and the interface is neither flow
 * controlled nor terminating.  Between rounds it computes how long to
 * sleep: a pacemaker deadline, the TBR restart cycle, or the delayed
 * start timeout -- otherwise it sleeps indefinitely until the next
 * ifnet_start().  On termination it clears if_start_thread, wakes any
 * waiter, and destroys itself.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: finish the embryonic handshake */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed start: for ENQUEUE_MULTI+DELAY_START interfaces
		 * with a short queue, defer the driver call so more packets
		 * can accumulate (woken again by the delay timeout below).
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec pacemaker_ts;
		struct timespec *ts = NULL;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		if (ifp->if_start_pacemaker_time != 0) {
			struct timespec now_ts;
			uint64_t now;

			nanouptime(&now_ts);
			now = ((uint64_t)now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

			/* pacemaker deadline still in the future: arm a wakeup for it */
			if (ifp->if_start_pacemaker_time != 0 &&
			    ifp->if_start_pacemaker_time > now) {
				pacemaker_ts.tv_sec = 0;
				pacemaker_ts.tv_nsec = ifp->if_start_pacemaker_time - now;

				ts = &pacemaker_ts;
				ifp->if_start_flags |= IFSF_NO_DELAY;
				DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, ifp,
				    uint64_t, pacemaker_ts.tv_nsec);
			} else {
				/* deadline already passed; record the miss and disarm */
				DTRACE_SKYWALK2(pacemaker__timer__miss, struct ifnet*, ifp,
				    uint64_t, now - ifp->if_start_pacemaker_time);
				ifp->if_start_pacemaker_time = 0;
				ifp->if_start_flags &= ~IFSF_NO_DELAY;
			}
		}

		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "no timed wakeup" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		ifp->if_start_pacemaker_time = 0;
		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4833 
4834 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4835 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4836 {
4837 	if (ts == NULL) {
4838 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4839 	} else {
4840 		*(&ifp->if_start_cycle) = *ts;
4841 	}
4842 
4843 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4844 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4845 		    if_name(ifp), ts->tv_nsec);
4846 	}
4847 }
4848 
4849 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4850 ifnet_poll_wakeup(struct ifnet *ifp)
4851 {
4852 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4853 
4854 	ifp->if_poll_req++;
4855 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4856 	    ifp->if_poll_thread != THREAD_NULL) {
4857 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4858 	}
4859 }
4860 
/*
 * Request a poll pass from the interface's poller thread
 * (see ifnet_poll_thread_cont).
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4871 
/*
 * Entry point of the per-interface RX poller thread (IFEF_RXPOLL).
 *
 * Names the thread, performs the embryonic handshake (assert-wait plus
 * one self-wakeup so the continuation can leave the embryonic state),
 * then blocks into ifnet_poll_thread_cont(), which runs the poll loop
 * forever.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4900 
/*
 * Continuation body of the RX poller thread.
 *
 * While requests (if_poll_req) keep arriving, it calls the driver's
 * if_input_poll routine for up to m_lim packets per round (holding an
 * IO refcnt so the interface cannot detach mid-call) and feeds the
 * harvested chain into ifnet_input_common().  Between rounds it sleeps
 * either for the configured poll cycle or indefinitely until the next
 * ifnet_poll().  On termination it clears if_poll_thread, wakes any
 * waiter, and destroys itself.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/* first wakeup after creation: finish the embryonic handshake */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* per-round packet budget: explicit limit, or derived default */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll: still notify the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5067 
5068 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5069 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5070 {
5071 	if (ts == NULL) {
5072 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5073 	} else {
5074 		*(&ifp->if_poll_cycle) = *ts;
5075 	}
5076 
5077 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5078 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5079 		    if_name(ifp), ts->tv_nsec);
5080 	}
5081 }
5082 
5083 void
ifnet_purge(struct ifnet * ifp)5084 ifnet_purge(struct ifnet *ifp)
5085 {
5086 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5087 		if_qflush_snd(ifp, false);
5088 	}
5089 }
5090 
/*
 * Propagate a classq event to a send queue.  Caller must hold the
 * IFCQ lock; a no-op until the queue is marked ready.  If a token
 * bucket regulator is active, its current profile (raw rate/percent)
 * is re-applied first -- presumably so the TBR is recomputed against
 * the changed link parameters (TODO confirm), -- before letting the
 * scheduler itself react to the event.
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	ifclassq_update(ifq, ev);
}
5110 
5111 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5112 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5113 {
5114 	switch (ev) {
5115 	case CLASSQ_EV_LINK_BANDWIDTH:
5116 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5117 			ifp->if_poll_update++;
5118 		}
5119 		break;
5120 
5121 	default:
5122 		break;
5123 	}
5124 }
5125 
/*
 * Change the interface's output scheduling model and rebuild the
 * packet scheduler accordingly; the previous model is restored if the
 * rebuild fails.
 *
 * Returns EINVAL for a NULL ifp or out-of-range model, ENXIO when the
 * interface has no starter thread (no IFEF_TXSTART), else the result
 * of ifclassq_pktsched_setup().
 */
errno_t
ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
{
	struct ifclassq *ifq;
	u_int32_t omodel;
	errno_t err;

	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	omodel = ifp->if_output_sched_model;
	ifp->if_output_sched_model = model;
	/* roll back to the previous model if the scheduler setup fails */
	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
		ifp->if_output_sched_model = omodel;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
5150 
5151 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5152 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5153 {
5154 	if (ifp == NULL) {
5155 		return EINVAL;
5156 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5157 		return ENXIO;
5158 	}
5159 
5160 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5161 
5162 	return 0;
5163 }
5164 
5165 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5166 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5167 {
5168 	if (ifp == NULL || maxqlen == NULL) {
5169 		return EINVAL;
5170 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5171 		return ENXIO;
5172 	}
5173 
5174 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5175 
5176 	return 0;
5177 }
5178 
5179 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5180 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5181 {
5182 	errno_t err;
5183 
5184 	if (ifp == NULL || pkts == NULL) {
5185 		err = EINVAL;
5186 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5187 		err = ENXIO;
5188 	} else {
5189 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5190 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
5191 	}
5192 
5193 	return err;
5194 }
5195 
5196 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5197 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5198     u_int32_t *pkts, u_int32_t *bytes)
5199 {
5200 	errno_t err;
5201 
5202 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5203 	    (pkts == NULL && bytes == NULL)) {
5204 		err = EINVAL;
5205 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5206 		err = ENXIO;
5207 	} else {
5208 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5209 		    pkts, bytes);
5210 	}
5211 
5212 	return err;
5213 }
5214 
5215 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5216 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5217 {
5218 	struct dlil_threading_info *inp;
5219 
5220 	if (ifp == NULL) {
5221 		return EINVAL;
5222 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5223 		return ENXIO;
5224 	}
5225 
5226 	if (maxqlen == 0) {
5227 		maxqlen = if_rcvq_maxlen;
5228 	} else if (maxqlen < IF_RCVQ_MINLEN) {
5229 		maxqlen = IF_RCVQ_MINLEN;
5230 	}
5231 
5232 	inp = ifp->if_inp;
5233 	lck_mtx_lock(&inp->dlth_lock);
5234 	qlimit(&inp->dlth_pkts) = maxqlen;
5235 	lck_mtx_unlock(&inp->dlth_lock);
5236 
5237 	return 0;
5238 }
5239 
5240 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5241 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5242 {
5243 	struct dlil_threading_info *inp;
5244 
5245 	if (ifp == NULL || maxqlen == NULL) {
5246 		return EINVAL;
5247 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5248 		return ENXIO;
5249 	}
5250 
5251 	inp = ifp->if_inp;
5252 	lck_mtx_lock(&inp->dlth_lock);
5253 	*maxqlen = qlimit(&inp->dlth_pkts);
5254 	lck_mtx_unlock(&inp->dlth_lock);
5255 	return 0;
5256 }
5257 
5258 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5259 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5260     uint16_t delay_timeout)
5261 {
5262 	if (delay_qlen > 0 && delay_timeout > 0) {
5263 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5264 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5265 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
5266 		/* convert timeout to nanoseconds */
5267 		ifp->if_start_delay_timeout *= 1000;
5268 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5269 		    ifp->if_xname, (uint32_t)delay_qlen,
5270 		    (uint32_t)delay_timeout);
5271 	} else {
5272 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5273 	}
5274 }
5275 
5276 /*
5277  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
5278  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
5279  * buf holds the full header.
5280  */
/*
 * Clear the DSCP bits (preserving ECN) in the IPv4/IPv6 header at
 * `buf'.  If `buf' is not suitably aligned for header access, the
 * header is bounced through a local aligned buffer and copied back
 * only when it was actually modified.  For IPv4 the header checksum
 * is adjusted incrementally for the TOS change rather than being
 * recomputed.  `ip_ver' must be IPVERSION or IPV6_VERSION; the caller
 * must ensure `buf' holds the full header (see the block comment
 * above this function).
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, large enough for either header type */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* nothing to do when only ECN bits (or nothing) are set */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/* incremental checksum update for the TOS byte change */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write the modified header back if we bounced it */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* nothing to do when the DSCP field is already zero */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write the modified header back if we bounced it */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
5336 
/*
 * Enqueue one packet -- an mbuf (compat path) or a Skywalk __kern_packet
 * (native path) -- on the interface output classq (`ifcq' if non-NULL,
 * otherwise ifp->if_snd).  Stamps an uptime timestamp on the packet when
 * it does not already carry a valid one, records foreground/realtime send
 * activity on the interface (and in the nexus advisory region when
 * attached to a flowswitch), applies the multicast DSCP-clearing
 * workaround on Wi-Fi infrastructure interfaces, runs the delayed-start
 * heuristics for IFEF_ENQUEUE_MULTI drivers, and finally kicks the
 * driver's start routine.  The packet is consumed by the classq in all
 * cases except the ENOMEM pullup failures below; *pdrop reports whether
 * the classq dropped it.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;	/* nexus foreground send ts */
	volatile uint64_t *rt_ts = NULL;	/* nexus realtime send ts */
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;	/* if set: IP header to clear DSCP on */
	uint8_t ip_ver;			/* valid only when mcast_buf != NULL */
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: no DSCP to clear */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup() may relocate the data */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* runt frame: skip the workaround, enqueue as-is */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: no DSCP to clear */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* apply the Wi-Fi multicast DSCP workaround, if flagged above */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset, disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open a new window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5647 
5648 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5649 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5650     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5651     boolean_t flush, boolean_t *pdrop)
5652 {
5653 	int error;
5654 
5655 	/* enqueue the packet (caller consumes object) */
5656 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5657 	    cnt, bytes, pdrop);
5658 
5659 	/*
5660 	 * Tell the driver to start dequeueing; do this even when the queue
5661 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5662 	 * be dequeueing from other unsuspended queues.
5663 	 */
5664 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5665 		ifnet_start(ifp);
5666 	}
5667 	return error;
5668 }
5669 
5670 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5671 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5672 {
5673 	struct ifnet *ifp = handle;
5674 	boolean_t pdrop;        /* dummy */
5675 	uint32_t i;
5676 
5677 	ASSERT(n_pkts >= 1);
5678 	for (i = 0; i < n_pkts - 1; i++) {
5679 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5680 		    FALSE, &pdrop);
5681 	}
5682 	/* flush with the last packet */
5683 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5684 	    TRUE, &pdrop);
5685 
5686 	return 0;
5687 }
5688 
5689 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5690 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5691     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5692 {
5693 	if (ifp->if_output_netem != NULL) {
5694 		bool drop;
5695 		errno_t error;
5696 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5697 		*pdrop = drop ? TRUE : FALSE;
5698 		return error;
5699 	} else {
5700 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5701 	}
5702 }
5703 
5704 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5705 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5706 {
5707 	boolean_t pdrop;
5708 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5709 }
5710 
5711 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5712 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5713     boolean_t *pdrop)
5714 {
5715 	classq_pkt_t pkt;
5716 
5717 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5718 	    m->m_nextpkt != NULL) {
5719 		if (m != NULL) {
5720 			m_freem_list(m);
5721 			*pdrop = TRUE;
5722 		}
5723 		return EINVAL;
5724 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5725 	    !IF_FULLY_ATTACHED(ifp)) {
5726 		/* flag tested without lock for performance */
5727 		m_freem(m);
5728 		*pdrop = TRUE;
5729 		return ENXIO;
5730 	} else if (!(ifp->if_flags & IFF_UP)) {
5731 		m_freem(m);
5732 		*pdrop = TRUE;
5733 		return ENETDOWN;
5734 	}
5735 
5736 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5737 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5738 }
5739 
5740 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5741 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5742     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5743     boolean_t *pdrop)
5744 {
5745 	classq_pkt_t head, tail;
5746 
5747 	ASSERT(m_head != NULL);
5748 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5749 	ASSERT(m_tail != NULL);
5750 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5751 	ASSERT(ifp != NULL);
5752 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5753 
5754 	if (!IF_FULLY_ATTACHED(ifp)) {
5755 		/* flag tested without lock for performance */
5756 		m_freem_list(m_head);
5757 		*pdrop = TRUE;
5758 		return ENXIO;
5759 	} else if (!(ifp->if_flags & IFF_UP)) {
5760 		m_freem_list(m_head);
5761 		*pdrop = TRUE;
5762 		return ENETDOWN;
5763 	}
5764 
5765 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5766 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5767 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5768 	           flush, pdrop);
5769 }
5770 
5771 #if SKYWALK
5772 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5773 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5774     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5775 {
5776 	classq_pkt_t pkt;
5777 
5778 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5779 
5780 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5781 		if (kpkt != NULL) {
5782 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5783 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5784 			*pdrop = TRUE;
5785 		}
5786 		return EINVAL;
5787 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5788 	    !IF_FULLY_ATTACHED(ifp))) {
5789 		/* flag tested without lock for performance */
5790 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5791 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5792 		*pdrop = TRUE;
5793 		return ENXIO;
5794 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5795 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5796 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5797 		*pdrop = TRUE;
5798 		return ENETDOWN;
5799 	}
5800 
5801 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5802 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5803 }
5804 
5805 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5806 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5807     boolean_t flush, boolean_t *pdrop)
5808 {
5809 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5810 }
5811 
5812 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5813 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5814     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5815 {
5816 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5817 }
5818 
5819 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5820 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5821     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5822     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5823 {
5824 	classq_pkt_t head, tail;
5825 
5826 	ASSERT(k_head != NULL);
5827 	ASSERT(k_tail != NULL);
5828 	ASSERT(ifp != NULL);
5829 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5830 
5831 	if (!IF_FULLY_ATTACHED(ifp)) {
5832 		/* flag tested without lock for performance */
5833 		pp_free_packet_chain(k_head, NULL);
5834 		*pdrop = TRUE;
5835 		return ENXIO;
5836 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5837 		pp_free_packet_chain(k_head, NULL);
5838 		*pdrop = TRUE;
5839 		return ENETDOWN;
5840 	}
5841 
5842 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5843 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5844 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5845 	           flush, pdrop);
5846 }
5847 
5848 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5849 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5850     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5851     boolean_t *pdrop)
5852 {
5853 	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5854 	           cnt, bytes, flush, pdrop);
5855 }
5856 
5857 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5858 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5859     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5860     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5861 {
5862 	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5863 	           cnt, bytes, flush, pdrop);
5864 }
5865 #endif /* SKYWALK */
5866 
5867 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5868 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5869 {
5870 	errno_t rc;
5871 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5872 
5873 	if (ifp == NULL || mp == NULL) {
5874 		return EINVAL;
5875 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5876 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5877 		return ENXIO;
5878 	}
5879 	if (!ifnet_is_attached(ifp, 1)) {
5880 		return ENXIO;
5881 	}
5882 
5883 #if SKYWALK
5884 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5885 #endif /* SKYWALK */
5886 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5887 	    &pkt, NULL, NULL, NULL, 0);
5888 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5889 	ifnet_decr_iorefcnt(ifp);
5890 	*mp = pkt.cp_mbuf;
5891 	return rc;
5892 }
5893 
5894 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5895 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5896     struct mbuf **mp)
5897 {
5898 	errno_t rc;
5899 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5900 
5901 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5902 		return EINVAL;
5903 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5904 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5905 		return ENXIO;
5906 	}
5907 	if (!ifnet_is_attached(ifp, 1)) {
5908 		return ENXIO;
5909 	}
5910 
5911 #if SKYWALK
5912 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5913 #endif /* SKYWALK */
5914 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5915 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5916 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5917 	ifnet_decr_iorefcnt(ifp);
5918 	*mp = pkt.cp_mbuf;
5919 	return rc;
5920 }
5921 
5922 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5923 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5924     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5925 {
5926 	errno_t rc;
5927 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5928 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5929 
5930 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5931 		return EINVAL;
5932 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5933 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5934 		return ENXIO;
5935 	}
5936 	if (!ifnet_is_attached(ifp, 1)) {
5937 		return ENXIO;
5938 	}
5939 
5940 #if SKYWALK
5941 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5942 #endif /* SKYWALK */
5943 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5944 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5945 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5946 	ifnet_decr_iorefcnt(ifp);
5947 	*head = pkt_head.cp_mbuf;
5948 	if (tail != NULL) {
5949 		*tail = pkt_tail.cp_mbuf;
5950 	}
5951 	return rc;
5952 }
5953 
5954 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5955 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5956     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5957 {
5958 	errno_t rc;
5959 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5960 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5961 
5962 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5963 		return EINVAL;
5964 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5965 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5966 		return ENXIO;
5967 	}
5968 	if (!ifnet_is_attached(ifp, 1)) {
5969 		return ENXIO;
5970 	}
5971 
5972 #if SKYWALK
5973 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5974 #endif /* SKYWALK */
5975 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5976 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5977 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5978 	ifnet_decr_iorefcnt(ifp);
5979 	*head = pkt_head.cp_mbuf;
5980 	if (tail != NULL) {
5981 		*tail = pkt_tail.cp_mbuf;
5982 	}
5983 	return rc;
5984 }
5985 
5986 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5987 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5988     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5989     u_int32_t *len)
5990 {
5991 	errno_t rc;
5992 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5993 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5994 
5995 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5996 	    !MBUF_VALID_SC(sc)) {
5997 		return EINVAL;
5998 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5999 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
6000 		return ENXIO;
6001 	}
6002 	if (!ifnet_is_attached(ifp, 1)) {
6003 		return ENXIO;
6004 	}
6005 
6006 #if SKYWALK
6007 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6008 #endif /* SKYWALK */
6009 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6010 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6011 	    cnt, len, 0);
6012 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6013 	ifnet_decr_iorefcnt(ifp);
6014 	*head = pkt_head.cp_mbuf;
6015 	if (tail != NULL) {
6016 		*tail = pkt_tail.cp_mbuf;
6017 	}
6018 	return rc;
6019 }
6020 
6021 #if XNU_TARGET_OS_OSX
6022 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)6023 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
6024     const struct sockaddr *dest, const char *dest_linkaddr,
6025     const char *frame_type, u_int32_t *pre, u_int32_t *post)
6026 {
6027 	if (pre != NULL) {
6028 		*pre = 0;
6029 	}
6030 	if (post != NULL) {
6031 		*post = 0;
6032 	}
6033 
6034 	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
6035 }
6036 #endif /* XNU_TARGET_OS_OSX */
6037 
6038 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6039 packet_has_vlan_tag(struct mbuf * m)
6040 {
6041 	u_int   tag = 0;
6042 
6043 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6044 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6045 		if (tag == 0) {
6046 			/* the packet is just priority-tagged, clear the bit */
6047 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6048 		}
6049 	}
6050 	return tag != 0;
6051 }
6052 
/*
 * Run an inbound packet (*m_p, with its frame header at *frame_header_p)
 * through the interface's input filter chain.  A filter may modify or
 * replace the packet via m_p.  Returns 0 when the packet survived all
 * filters, or the first non-zero filter result (the filter then owns the
 * packet's fate).  VLAN-tagged packets bypass external (non-internal)
 * filters.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* NB: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; the busy
			 * marker above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6113 
/*
 * Run an outbound packet (*m_p) through the interface's output filter
 * chain.  A filter may modify or replace the packet via m_p.  Returns 0
 * when the packet survived all filters, or the first non-zero filter
 * result (the filter then owns the packet's fate).  VLAN-tagged packets
 * bypass external (non-internal) filters.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* NB: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; the busy
			 * marker above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6166 
6167 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)6168 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6169 {
6170 	int error;
6171 
6172 	if (ifproto->proto_kpi == kProtoKPI_v1) {
6173 		/* Version 1 protocols get one packet at a time */
6174 		while (m != NULL) {
6175 			char *  frame_header;
6176 			mbuf_t  next_packet;
6177 
6178 			next_packet = m->m_nextpkt;
6179 			m->m_nextpkt = NULL;
6180 			frame_header = m->m_pkthdr.pkt_hdr;
6181 			m->m_pkthdr.pkt_hdr = NULL;
6182 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6183 			    ifproto->protocol_family, m, frame_header);
6184 			if (error != 0 && error != EJUSTRETURN) {
6185 				m_freem(m);
6186 			}
6187 			m = next_packet;
6188 		}
6189 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
6190 		/* Version 2 protocols support packet lists */
6191 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6192 		    ifproto->protocol_family, m);
6193 		if (error != 0 && error != EJUSTRETURN) {
6194 			m_freem_list(m);
6195 		}
6196 	}
6197 }
6198 
6199 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6200 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6201     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6202 {
6203 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6204 
6205 	if (s->packets_in != 0) {
6206 		d->packets_in += s->packets_in;
6207 	}
6208 	if (s->bytes_in != 0) {
6209 		d->bytes_in += s->bytes_in;
6210 	}
6211 	if (s->errors_in != 0) {
6212 		d->errors_in += s->errors_in;
6213 	}
6214 
6215 	if (s->packets_out != 0) {
6216 		d->packets_out += s->packets_out;
6217 	}
6218 	if (s->bytes_out != 0) {
6219 		d->bytes_out += s->bytes_out;
6220 	}
6221 	if (s->errors_out != 0) {
6222 		d->errors_out += s->errors_out;
6223 	}
6224 
6225 	if (s->collisions != 0) {
6226 		d->collisions += s->collisions;
6227 	}
6228 	if (s->dropped != 0) {
6229 		d->dropped += s->dropped;
6230 	}
6231 
6232 	if (poll) {
6233 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6234 	}
6235 }
6236 
6237 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)6238 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
6239 {
6240 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
6241 
6242 	/*
6243 	 * Use of atomic operations is unavoidable here because
6244 	 * these stats may also be incremented elsewhere via KPIs.
6245 	 */
6246 	if (s->packets_in != 0) {
6247 		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
6248 		s->packets_in = 0;
6249 	}
6250 	if (s->bytes_in != 0) {
6251 		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
6252 		s->bytes_in = 0;
6253 	}
6254 	if (s->errors_in != 0) {
6255 		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
6256 		s->errors_in = 0;
6257 	}
6258 
6259 	if (s->packets_out != 0) {
6260 		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
6261 		s->packets_out = 0;
6262 	}
6263 	if (s->bytes_out != 0) {
6264 		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
6265 		s->bytes_out = 0;
6266 	}
6267 	if (s->errors_out != 0) {
6268 		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
6269 		s->errors_out = 0;
6270 	}
6271 
6272 	if (s->collisions != 0) {
6273 		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
6274 		s->collisions = 0;
6275 	}
6276 	if (s->dropped != 0) {
6277 		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
6278 		s->dropped = 0;
6279 	}
6280 
6281 	/*
6282 	 * No need for atomic operations as they are modified here
6283 	 * only from within the DLIL input thread context.
6284 	 */
6285 	if (ifp->if_poll_tstats.packets != 0) {
6286 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
6287 		ifp->if_poll_tstats.packets = 0;
6288 	}
6289 	if (ifp->if_poll_tstats.bytes != 0) {
6290 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
6291 		ifp->if_poll_tstats.bytes = 0;
6292 	}
6293 
6294 	return ifp->if_data_threshold != 0;
6295 }
6296 
6297 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6298 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6299 {
6300 	return dlil_input_packet_list_common(ifp, m, 0,
6301 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6302 }
6303 
6304 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6305 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6306     u_int32_t cnt, ifnet_model_t mode)
6307 {
6308 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6309 }
6310 
/*
 * dlil_input_packet_list_common
 *
 * Core inbound demux loop.  Walks the mbuf packet chain, and for each
 * packet: optionally kicks legacy RX polling, takes a data-mov IO
 * reference on non-loopback interfaces, sanitizes packet flags, demuxes
 * to a protocol family via if_demux, performs CLAT46/64 translation and
 * partial-checksum offset adjustment where applicable, runs interface
 * filters, then batches consecutive packets destined for the same
 * attached protocol and hands each batch to dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used
 * (packets in one chain may then span multiple interfaces).
 * ext/cnt/mode come from dlil_input_packet_list_extended() and enable
 * the periodic ifnet_poll() kick for legacy polling drivers.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;        /* head of the per-proto batch */
	mbuf_t *pkt_next = NULL;        /* tail pointer of the batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;               /* 1 while we hold a datamov ref */

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* Only arm the polling kick for multi-packet extended input. */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		/* no interface given: trust the packet's receive ifp */
		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* periodically poke legacy RX-polling drivers */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain; stash its L2 header */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				goto next;
			}
			/* unknown family: fall through, dropped below */
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* optional debug hexdump of wake packets */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* may replace m and flip protocol_family to PF_INET */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* an out-of-range header invalidates the checksum */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6643 
6644 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6645 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6646 {
6647 	errno_t err;
6648 
6649 	if (sync) {
6650 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6651 		if (err == EAFNOSUPPORT) {
6652 			err = 0;
6653 		}
6654 	} else {
6655 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6656 		err = 0;
6657 	}
6658 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6659 	    "(err=%d)\n", if_name(ifp),
6660 	    (err == 0 ? "successfully restored" : "failed to restore"),
6661 	    ifp->if_updatemcasts, err);
6662 
6663 	/* just return success */
6664 	return 0;
6665 }
6666 
6667 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6668 if_mcasts_update_async(struct ifnet *ifp)
6669 {
6670 	return if_mcasts_update_common(ifp, false);
6671 }
6672 
6673 errno_t
if_mcasts_update(struct ifnet * ifp)6674 if_mcasts_update(struct ifnet *ifp)
6675 {
6676 	return if_mcasts_update_common(ifp, true);
6677 }
6678 
/*
 * Post a kernel event message.  If ifp is set, we will increment the
 * generation for the interface before notifying NECP clients and
 * posting the event.  Returns the result of kev_post_msg().
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp != NULL) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	/* notify NECP clients after the generation bump */
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6693 
6694 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6695 dlil_post_sifflags_msg(struct ifnet * ifp)
6696 {
6697 	struct kev_msg ev_msg;
6698 	struct net_event_data ev_data;
6699 
6700 	bzero(&ev_data, sizeof(ev_data));
6701 	bzero(&ev_msg, sizeof(ev_msg));
6702 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6703 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6704 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6705 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6706 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6707 	ev_data.if_family = ifp->if_family;
6708 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6709 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6710 	ev_msg.dv[0].data_ptr = &ev_data;
6711 	ev_msg.dv[1].data_length = 0;
6712 	dlil_post_complete_msg(ifp, &ev_msg);
6713 }
6714 
6715 #define TMP_IF_PROTO_ARR_SIZE   10
/*
 * dlil_event_internal
 *
 * Deliver a kernel event to, in order: all interface filters, every
 * protocol attached to the interface, and the interface itself; then
 * post the event system-wide via dlil_post_complete_msg().  The
 * attached protocols are snapshotted (with refs held) into a local
 * array so their event callbacks can run without the ifnet lock.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	/* small stack array avoids allocation in the common case */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the filter callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot every attached proto, taking a ref on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted proto, dropping refs */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6815 
6816 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6817 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6818 {
6819 	struct kev_msg kev_msg;
6820 	int result = 0;
6821 
6822 	if (ifp == NULL || event == NULL) {
6823 		return EINVAL;
6824 	}
6825 
6826 	bzero(&kev_msg, sizeof(kev_msg));
6827 	kev_msg.vendor_code = event->vendor_code;
6828 	kev_msg.kev_class = event->kev_class;
6829 	kev_msg.kev_subclass = event->kev_subclass;
6830 	kev_msg.event_code = event->event_code;
6831 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6832 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6833 	kev_msg.dv[1].data_length = 0;
6834 
6835 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6836 
6837 	return result;
6838 }
6839 
6840 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6841 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6842 {
6843 	mbuf_t  n = m;
6844 	int chainlen = 0;
6845 
6846 	while (n != NULL) {
6847 		chainlen++;
6848 		n = n->m_next;
6849 	}
6850 	switch (chainlen) {
6851 	case 0:
6852 		break;
6853 	case 1:
6854 		os_atomic_inc(&cls->cls_one, relaxed);
6855 		break;
6856 	case 2:
6857 		os_atomic_inc(&cls->cls_two, relaxed);
6858 		break;
6859 	case 3:
6860 		os_atomic_inc(&cls->cls_three, relaxed);
6861 		break;
6862 	case 4:
6863 		os_atomic_inc(&cls->cls_four, relaxed);
6864 		break;
6865 	case 5:
6866 	default:
6867 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6868 		break;
6869 	}
6870 }
6871 
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 mbuf.
 * Kept out-of-line (noinline) so the probe site stays isolated from
 * the dlil_output() hot path.  Packets of other families are ignored.
 * NOTE(review): assumes the IP header is at the start of the mbuf data
 * (mtod) when this is called — confirm against the dlil_output caller.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6890 
6891 /*
6892  * dlil_output
6893  *
6894  * Caller should have a lock on the protocol domain if the protocol
6895  * doesn't support finer grained locking. In most cases, the lock
6896  * will be held from the socket layer and won't be released until
6897  * we return back to the socket layer.
6898  *
6899  * This does mean that we must take a protocol lock before we take
6900  * an interface lock if we're going to take both. This makes sense
6901  * because a protocol is likely to interact with an ifp while it
6902  * is under the protocol lock.
6903  *
6904  * An advisory code will be returned if adv is not null. This
6905  * can be used to provide feedback about interface queues to the
6906  * application.
6907  */
6908 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6909 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6910     void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6911 {
6912 	char *frame_type = NULL;
6913 	char *dst_linkaddr = NULL;
6914 	int retval = 0;
6915 	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6916 	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6917 	struct if_proto *proto = NULL;
6918 	mbuf_t  m = NULL;
6919 	mbuf_t  send_head = NULL;
6920 	mbuf_t  *send_tail = &send_head;
6921 	int iorefcnt = 0;
6922 	u_int32_t pre = 0, post = 0;
6923 	u_int32_t fpkts = 0, fbytes = 0;
6924 	int32_t flen = 0;
6925 	struct timespec now;
6926 	u_int64_t now_nsec;
6927 	boolean_t did_clat46 = FALSE;
6928 	protocol_family_t old_proto_family = proto_family;
6929 	struct sockaddr_in6 dest6;
6930 	struct rtentry *rt = NULL;
6931 	u_int32_t m_loop_set = 0;
6932 
6933 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6934 
6935 	/*
6936 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6937 	 * from happening while this operation is in progress
6938 	 */
6939 	if (!ifnet_datamov_begin(ifp)) {
6940 		retval = ENXIO;
6941 		goto cleanup;
6942 	}
6943 	iorefcnt = 1;
6944 
6945 	VERIFY(ifp->if_output_dlil != NULL);
6946 
6947 	/* update the driver's multicast filter, if needed */
6948 	if (ifp->if_updatemcasts > 0) {
6949 		if_mcasts_update_async(ifp);
6950 		ifp->if_updatemcasts = 0;
6951 	}
6952 
6953 	frame_type = frame_type_buffer;
6954 	dst_linkaddr = dst_linkaddr_buffer;
6955 
6956 	if (raw == 0) {
6957 		ifnet_lock_shared(ifp);
6958 		/* callee holds a proto refcnt upon success */
6959 		proto = find_attached_proto(ifp, proto_family);
6960 		if (proto == NULL) {
6961 			ifnet_lock_done(ifp);
6962 			retval = ENXIO;
6963 			goto cleanup;
6964 		}
6965 		ifnet_lock_done(ifp);
6966 	}
6967 
6968 preout_again:
6969 	if (packetlist == NULL) {
6970 		goto cleanup;
6971 	}
6972 
6973 	m = packetlist;
6974 	packetlist = packetlist->m_nextpkt;
6975 	m->m_nextpkt = NULL;
6976 
6977 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6978 
6979 	/*
6980 	 * Perform address family translation for the first
6981 	 * packet outside the loop in order to perform address
6982 	 * lookup for the translated proto family.
6983 	 */
6984 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6985 	    (ifp->if_type == IFT_CELLULAR ||
6986 	    dlil_is_clat_needed(proto_family, m))) {
6987 		retval = dlil_clat46(ifp, &proto_family, &m);
6988 		/*
6989 		 * Go to the next packet if translation fails
6990 		 */
6991 		if (retval != 0) {
6992 			m_freem(m);
6993 			m = NULL;
6994 			ip6stat.ip6s_clat464_out_drop++;
6995 			/* Make sure that the proto family is PF_INET */
6996 			ASSERT(proto_family == PF_INET);
6997 			goto preout_again;
6998 		}
6999 		/*
7000 		 * Free the old one and make it point to the IPv6 proto structure.
7001 		 *
7002 		 * Change proto for the first time we have successfully
7003 		 * performed address family translation.
7004 		 */
7005 		if (!did_clat46 && proto_family == PF_INET6) {
7006 			did_clat46 = TRUE;
7007 
7008 			if (proto != NULL) {
7009 				if_proto_free(proto);
7010 			}
7011 			ifnet_lock_shared(ifp);
7012 			/* callee holds a proto refcnt upon success */
7013 			proto = find_attached_proto(ifp, proto_family);
7014 			if (proto == NULL) {
7015 				ifnet_lock_done(ifp);
7016 				retval = ENXIO;
7017 				m_freem(m);
7018 				m = NULL;
7019 				goto cleanup;
7020 			}
7021 			ifnet_lock_done(ifp);
7022 			if (ifp->if_type == IFT_ETHER) {
7023 				/* Update the dest to translated v6 address */
7024 				dest6.sin6_len = sizeof(struct sockaddr_in6);
7025 				dest6.sin6_family = AF_INET6;
7026 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
7027 				dest = (const struct sockaddr *)&dest6;
7028 
7029 				/*
7030 				 * Lookup route to the translated destination
7031 				 * Free this route ref during cleanup
7032 				 */
7033 				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
7034 				    0, 0, ifp->if_index);
7035 
7036 				route = rt;
7037 			}
7038 		}
7039 	}
7040 
7041 	/*
7042 	 * This path gets packet chain going to the same destination.
7043 	 * The pre output routine is used to either trigger resolution of
7044 	 * the next hop or retreive the next hop's link layer addressing.
7045 	 * For ex: ether_inet(6)_pre_output routine.
7046 	 *
7047 	 * If the routine returns EJUSTRETURN, it implies that packet has
7048 	 * been queued, and therefore we have to call preout_again for the
7049 	 * following packet in the chain.
7050 	 *
7051 	 * For errors other than EJUSTRETURN, the current packet is freed
7052 	 * and the rest of the chain (pointed by packetlist is freed as
7053 	 * part of clean up.
7054 	 *
7055 	 * Else if there is no error the retrieved information is used for
7056 	 * all the packets in the chain.
7057 	 */
7058 	if (raw == 0) {
7059 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
7060 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
7061 		retval = 0;
7062 		if (preoutp != NULL) {
7063 			retval = preoutp(ifp, proto_family, &m, dest, route,
7064 			    frame_type, dst_linkaddr);
7065 
7066 			if (retval != 0) {
7067 				if (retval == EJUSTRETURN) {
7068 					goto preout_again;
7069 				}
7070 				m_freem(m);
7071 				m = NULL;
7072 				goto cleanup;
7073 			}
7074 		}
7075 	}
7076 
7077 	do {
7078 		/*
7079 		 * pkt_hdr is set here to point to m_data prior to
7080 		 * calling into the framer. This value of pkt_hdr is
7081 		 * used by the netif gso logic to retrieve the ip header
7082 		 * for the TCP packets, offloaded for TSO processing.
7083 		 */
7084 		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
7085 			uint8_t vlan_encap_len = 0;
7086 
7087 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
7088 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
7089 			}
7090 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
7091 		} else {
7092 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
7093 		}
7094 
7095 		/*
7096 		 * Perform address family translation if needed.
7097 		 * For now we only support stateless 4 to 6 translation
7098 		 * on the out path.
7099 		 *
7100 		 * The routine below translates IP header, updates protocol
7101 		 * checksum and also translates ICMP.
7102 		 *
7103 		 * We skip the first packet as it is already translated and
7104 		 * the proto family is set to PF_INET6.
7105 		 */
7106 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
7107 		    (ifp->if_type == IFT_CELLULAR ||
7108 		    dlil_is_clat_needed(proto_family, m))) {
7109 			retval = dlil_clat46(ifp, &proto_family, &m);
7110 			/* Goto the next packet if the translation fails */
7111 			if (retval != 0) {
7112 				m_freem(m);
7113 				m = NULL;
7114 				ip6stat.ip6s_clat464_out_drop++;
7115 				goto next;
7116 			}
7117 		}
7118 
7119 #if CONFIG_DTRACE
7120 		if (!raw) {
7121 			dlil_output_dtrace(ifp, proto_family, m);
7122 		}
7123 #endif /* CONFIG_DTRACE */
7124 
7125 		if (raw == 0 && ifp->if_framer != NULL) {
7126 			int rcvif_set = 0;
7127 
7128 			/*
7129 			 * If this is a broadcast packet that needs to be
7130 			 * looped back into the system, set the inbound ifp
7131 			 * to that of the outbound ifp.  This will allow
7132 			 * us to determine that it is a legitimate packet
7133 			 * for the system.  Only set the ifp if it's not
7134 			 * already set, just to be safe.
7135 			 */
7136 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
7137 			    m->m_pkthdr.rcvif == NULL) {
7138 				m->m_pkthdr.rcvif = ifp;
7139 				rcvif_set = 1;
7140 			}
7141 			m_loop_set = m->m_flags & M_LOOP;
7142 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
7143 			    frame_type, &pre, &post);
7144 			if (retval != 0) {
7145 				if (retval != EJUSTRETURN) {
7146 					m_freem(m);
7147 				}
7148 				goto next;
7149 			}
7150 
7151 			/*
7152 			 * For partial checksum offload, adjust the start
7153 			 * and stuff offsets based on the prepended header.
7154 			 */
7155 			if ((m->m_pkthdr.csum_flags &
7156 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7157 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7158 				m->m_pkthdr.csum_tx_stuff += pre;
7159 				m->m_pkthdr.csum_tx_start += pre;
7160 			}
7161 
7162 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
7163 				dlil_output_cksum_dbg(ifp, m, pre,
7164 				    proto_family);
7165 			}
7166 
7167 			/*
7168 			 * Clear the ifp if it was set above, and to be
7169 			 * safe, only if it is still the same as the
7170 			 * outbound ifp we have in context.  If it was
7171 			 * looped back, then a copy of it was sent to the
7172 			 * loopback interface with the rcvif set, and we
7173 			 * are clearing the one that will go down to the
7174 			 * layer below.
7175 			 */
7176 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
7177 				m->m_pkthdr.rcvif = NULL;
7178 			}
7179 		}
7180 
7181 		/*
7182 		 * Let interface filters (if any) do their thing ...
7183 		 */
7184 		retval = dlil_interface_filters_output(ifp, &m, proto_family);
7185 		if (retval != 0) {
7186 			if (retval != EJUSTRETURN) {
7187 				m_freem(m);
7188 			}
7189 			goto next;
7190 		}
7191 		/*
7192 		 * Strip away M_PROTO1 bit prior to sending packet
7193 		 * to the driver as this field may be used by the driver
7194 		 */
7195 		m->m_flags &= ~M_PROTO1;
7196 
7197 		/*
7198 		 * If the underlying interface is not capable of handling a
7199 		 * packet whose data portion spans across physically disjoint
7200 		 * pages, we need to "normalize" the packet so that we pass
7201 		 * down a chain of mbufs where each mbuf points to a span that
7202 		 * resides in the system page boundary.  If the packet does
7203 		 * not cross page(s), the following is a no-op.
7204 		 */
7205 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
7206 			if ((m = m_normalize(m)) == NULL) {
7207 				goto next;
7208 			}
7209 		}
7210 
7211 		/*
7212 		 * If this is a TSO packet, make sure the interface still
7213 		 * advertise TSO capability.
7214 		 */
7215 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
7216 			retval = EMSGSIZE;
7217 			m_freem(m);
7218 			goto cleanup;
7219 		}
7220 
7221 		ifp_inc_traffic_class_out(ifp, m);
7222 
7223 #if SKYWALK
7224 		/*
7225 		 * For native skywalk devices, packets will be passed to pktap
7226 		 * after GSO or after the mbuf to packet conversion.
7227 		 * This is done for IPv4/IPv6 packets only because there is no
7228 		 * space in the mbuf to pass down the proto family.
7229 		 */
7230 		if (dlil_is_native_netif_nexus(ifp)) {
7231 			if (raw || m->m_pkthdr.pkt_proto == 0) {
7232 				pktap_output(ifp, proto_family, m, pre, post);
7233 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
7234 			}
7235 		} else {
7236 			pktap_output(ifp, proto_family, m, pre, post);
7237 		}
7238 #else /* SKYWALK */
7239 		pktap_output(ifp, proto_family, m, pre, post);
7240 #endif /* SKYWALK */
7241 
7242 		/*
7243 		 * Count the number of elements in the mbuf chain
7244 		 */
7245 		if (tx_chain_len_count) {
7246 			dlil_count_chain_len(m, &tx_chain_len_stats);
7247 		}
7248 
7249 		/*
7250 		 * Record timestamp; ifnet_enqueue() will use this info
7251 		 * rather than redoing the work.  An optimization could
7252 		 * involve doing this just once at the top, if there are
7253 		 * no interface filters attached, but that's probably
7254 		 * not a big deal.
7255 		 */
7256 		nanouptime(&now);
7257 		net_timernsec(&now, &now_nsec);
7258 		(void) mbuf_set_timestamp(m, now_nsec, TRUE);
7259 
7260 		/*
7261 		 * Discard partial sum information if this packet originated
7262 		 * from another interface; the packet would already have the
7263 		 * final checksum and we shouldn't recompute it.
7264 		 */
7265 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
7266 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7267 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7268 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
7269 			m->m_pkthdr.csum_data = 0;
7270 		}
7271 
7272 		/*
7273 		 * Finally, call the driver.
7274 		 */
7275 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
7276 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7277 				flen += (m_pktlen(m) - (pre + post));
7278 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7279 			}
7280 			*send_tail = m;
7281 			send_tail = &m->m_nextpkt;
7282 		} else {
7283 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7284 				flen = (m_pktlen(m) - (pre + post));
7285 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7286 			} else {
7287 				flen = 0;
7288 			}
7289 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7290 			    0, 0, 0, 0, 0);
7291 			retval = (*ifp->if_output_dlil)(ifp, m);
7292 			if (retval == EQFULL || retval == EQSUSPENDED) {
7293 				if (adv != NULL && adv->code == FADV_SUCCESS) {
7294 					adv->code = (retval == EQFULL ?
7295 					    FADV_FLOW_CONTROLLED :
7296 					    FADV_SUSPENDED);
7297 				}
7298 				retval = 0;
7299 			}
7300 			if (retval == 0 && flen > 0) {
7301 				fbytes += flen;
7302 				fpkts++;
7303 			}
7304 			if (retval != 0 && dlil_verbose) {
7305 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7306 				    __func__, if_name(ifp),
7307 				    retval);
7308 			}
7309 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7310 			    0, 0, 0, 0, 0);
7311 		}
7312 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7313 
7314 next:
7315 		m = packetlist;
7316 		if (m != NULL) {
7317 			m->m_flags |= m_loop_set;
7318 			packetlist = packetlist->m_nextpkt;
7319 			m->m_nextpkt = NULL;
7320 		}
7321 		/* Reset the proto family to old proto family for CLAT */
7322 		if (did_clat46) {
7323 			proto_family = old_proto_family;
7324 		}
7325 	} while (m != NULL);
7326 
7327 	if (send_head != NULL) {
7328 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7329 		    0, 0, 0, 0, 0);
7330 		if (ifp->if_eflags & IFEF_SENDLIST) {
7331 			retval = (*ifp->if_output_dlil)(ifp, send_head);
7332 			if (retval == EQFULL || retval == EQSUSPENDED) {
7333 				if (adv != NULL) {
7334 					adv->code = (retval == EQFULL ?
7335 					    FADV_FLOW_CONTROLLED :
7336 					    FADV_SUSPENDED);
7337 				}
7338 				retval = 0;
7339 			}
7340 			if (retval == 0 && flen > 0) {
7341 				fbytes += flen;
7342 				fpkts++;
7343 			}
7344 			if (retval != 0 && dlil_verbose) {
7345 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7346 				    __func__, if_name(ifp), retval);
7347 			}
7348 		} else {
7349 			struct mbuf *send_m;
7350 			int enq_cnt = 0;
7351 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7352 			while (send_head != NULL) {
7353 				send_m = send_head;
7354 				send_head = send_m->m_nextpkt;
7355 				send_m->m_nextpkt = NULL;
7356 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7357 				if (retval == EQFULL || retval == EQSUSPENDED) {
7358 					if (adv != NULL) {
7359 						adv->code = (retval == EQFULL ?
7360 						    FADV_FLOW_CONTROLLED :
7361 						    FADV_SUSPENDED);
7362 					}
7363 					retval = 0;
7364 				}
7365 				if (retval == 0) {
7366 					enq_cnt++;
7367 					if (flen > 0) {
7368 						fpkts++;
7369 					}
7370 				}
7371 				if (retval != 0 && dlil_verbose) {
7372 					DLIL_PRINTF("%s: output error on %s "
7373 					    "retval = %d\n",
7374 					    __func__, if_name(ifp), retval);
7375 				}
7376 			}
7377 			if (enq_cnt > 0) {
7378 				fbytes += flen;
7379 				ifnet_start(ifp);
7380 			}
7381 		}
7382 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7383 	}
7384 
7385 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7386 
7387 cleanup:
7388 	if (fbytes > 0) {
7389 		ifp->if_fbytes += fbytes;
7390 	}
7391 	if (fpkts > 0) {
7392 		ifp->if_fpackets += fpkts;
7393 	}
7394 	if (proto != NULL) {
7395 		if_proto_free(proto);
7396 	}
7397 	if (packetlist) { /* if any packets are left, clean up */
7398 		mbuf_freem_list(packetlist);
7399 	}
7400 	if (retval == EJUSTRETURN) {
7401 		retval = 0;
7402 	}
7403 	if (iorefcnt == 1) {
7404 		ifnet_datamov_end(ifp);
7405 	}
7406 	if (rt != NULL) {
7407 		rtfree(rt);
7408 		rt = NULL;
7409 	}
7410 
7411 	return retval;
7412 }
7413 
7414 /*
7415  * This routine checks if the destination address is not a loopback, link-local,
7416  * multicast or broadcast address.
7417  */
7418 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7419 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7420 {
7421 	int ret = 0;
7422 	switch (proto_family) {
7423 	case PF_INET: {
7424 		struct ip *iph = mtod(m, struct ip *);
7425 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7426 			ret = 1;
7427 		}
7428 		break;
7429 	}
7430 	case PF_INET6: {
7431 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7432 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7433 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7434 			ret = 1;
7435 		}
7436 		break;
7437 	}
7438 	}
7439 
7440 	return ret;
7441 }
7442 /*
7443  * @brief This routine translates IPv4 packet to IPv6 packet,
7444  *     updates protocol checksum and also translates ICMP for code
7445  *     along with inner header translation.
7446  *
7447  * @param ifp Pointer to the interface
7448  * @param proto_family pointer to protocol family. It is updated if function
7449  *     performs the translation successfully.
7450  * @param m Pointer to the pointer pointing to the packet. Needed because this
7451  *     routine can end up changing the mbuf to a different one.
7452  *
7453  * @return 0 on success or else a negative value.
7454  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;
	struct in6_addr dst;
	int error = 0;
	uint16_t off = 0;
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 routines can rewrite it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Save the IPv4 header fields needed after the header is rewritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/* Hand the (possibly reallocated) mbuf back to the caller */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		/* Translation invalidated the backing mbuf; nothing to return */
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7588 
7589 /*
7590  * @brief This routine translates incoming IPv6 to IPv4 packet,
7591  *     updates protocol checksum and also translates ICMPv6 outer
7592  *     and inner headers
7593  *
7594  * @return 0 on success or else a negative value.
7595  */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Save the original addresses for the protocol translation below */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf so the nat464 routines can rewrite it */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class bits of the IPv6 flow word become the IPv4 TOS */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly reallocated) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			/* Translation invalidated the backing mbuf */
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7730 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Deferred ioctl request: the interface and the ioctl code to reissue */
struct ifnet_ioctl_event {
	struct ifnet *ifp;
	u_long ioctl_code;
};

/*
 * Work-queue wrapper for a deferred ioctl; the callback recovers the
 * enclosing struct from the embedded nwk_wqe via __container_of().
 */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7743 
7744 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7745 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7746 {
7747 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7748 	bool compare_expected;
7749 
7750 	/*
7751 	 * Get an io ref count if the interface is attached.
7752 	 * At this point it most likely is. We are taking a reference for
7753 	 * deferred processing.
7754 	 */
7755 	if (!ifnet_is_attached(ifp, 1)) {
7756 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7757 		    "is not attached",
7758 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7759 		return;
7760 	}
7761 	switch (ioctl_code) {
7762 	case SIOCADDMULTI:
7763 		compare_expected = false;
7764 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7765 			ifnet_decr_iorefcnt(ifp);
7766 			return;
7767 		}
7768 		break;
7769 	case SIOCDELMULTI:
7770 		compare_expected = false;
7771 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7772 			ifnet_decr_iorefcnt(ifp);
7773 			return;
7774 		}
7775 		break;
7776 	default:
7777 		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7778 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7779 		return;
7780 	}
7781 
7782 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7783 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7784 
7785 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7786 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7787 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7788 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7789 }
7790 
7791 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7792 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7793 {
7794 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7795 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7796 
7797 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7798 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7799 	int ret = 0;
7800 
7801 	switch (ioctl_code) {
7802 	case SIOCADDMULTI:
7803 		atomic_store(&ifp->if_mcast_add_signaled, false);
7804 		break;
7805 	case SIOCDELMULTI:
7806 		atomic_store(&ifp->if_mcast_del_signaled, false);
7807 		break;
7808 	}
7809 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7810 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7811 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7812 	} else if (dlil_verbose) {
7813 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7814 		    "for ioctl %lu",
7815 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7816 	}
7817 	ifnet_decr_iorefcnt(ifp);
7818 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7819 	return;
7820 }
7821 
7822 errno_t
ifnet_ioctl(ifnet_t ifp,protocol_family_t proto_fam,u_long ioctl_code,void * ioctl_arg)7823 ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
7824     void *ioctl_arg)
7825 {
7826 	struct ifnet_filter *filter;
7827 	int retval = EOPNOTSUPP;
7828 	int result = 0;
7829 
7830 	if (ifp == NULL || ioctl_code == 0) {
7831 		return EINVAL;
7832 	}
7833 
7834 	/* Get an io ref count if the interface is attached */
7835 	if (!ifnet_is_attached(ifp, 1)) {
7836 		return EOPNOTSUPP;
7837 	}
7838 
7839 	/*
7840 	 * Run the interface filters first.
7841 	 * We want to run all filters before calling the protocol,
7842 	 * interface family, or interface.
7843 	 */
7844 	lck_mtx_lock_spin(&ifp->if_flt_lock);
7845 	/* prevent filter list from changing in case we drop the lock */
7846 	if_flt_monitor_busy(ifp);
7847 	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
7848 		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
7849 		    filter->filt_protocol == proto_fam)) {
7850 			lck_mtx_unlock(&ifp->if_flt_lock);
7851 
7852 			result = filter->filt_ioctl(filter->filt_cookie, ifp,
7853 			    proto_fam, ioctl_code, ioctl_arg);
7854 
7855 			lck_mtx_lock_spin(&ifp->if_flt_lock);
7856 
7857 			/* Only update retval if no one has handled the ioctl */
7858 			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7859 				if (result == ENOTSUP) {
7860 					result = EOPNOTSUPP;
7861 				}
7862 				retval = result;
7863 				if (retval != 0 && retval != EOPNOTSUPP) {
7864 					/* we're done with the filter list */
7865 					if_flt_monitor_unbusy(ifp);
7866 					lck_mtx_unlock(&ifp->if_flt_lock);
7867 					goto cleanup;
7868 				}
7869 			}
7870 		}
7871 	}
7872 	/* we're done with the filter list */
7873 	if_flt_monitor_unbusy(ifp);
7874 	lck_mtx_unlock(&ifp->if_flt_lock);
7875 
7876 	/* Allow the protocol to handle the ioctl */
7877 	if (proto_fam != 0) {
7878 		struct if_proto *proto;
7879 
7880 		/* callee holds a proto refcnt upon success */
7881 		ifnet_lock_shared(ifp);
7882 		proto = find_attached_proto(ifp, proto_fam);
7883 		ifnet_lock_done(ifp);
7884 		if (proto != NULL) {
7885 			proto_media_ioctl ioctlp =
7886 			    (proto->proto_kpi == kProtoKPI_v1 ?
7887 			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
7888 			result = EOPNOTSUPP;
7889 			if (ioctlp != NULL) {
7890 				result = ioctlp(ifp, proto_fam, ioctl_code,
7891 				    ioctl_arg);
7892 			}
7893 			if_proto_free(proto);
7894 
7895 			/* Only update retval if no one has handled the ioctl */
7896 			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7897 				if (result == ENOTSUP) {
7898 					result = EOPNOTSUPP;
7899 				}
7900 				retval = result;
7901 				if (retval && retval != EOPNOTSUPP) {
7902 					goto cleanup;
7903 				}
7904 			}
7905 		}
7906 	}
7907 
7908 	/* retval is either 0 or EOPNOTSUPP */
7909 
7910 	/*
7911 	 * Let the interface handle this ioctl.
7912 	 * If it returns EOPNOTSUPP, ignore that, we may have
7913 	 * already handled this in the protocol or family.
7914 	 */
7915 	if (ifp->if_ioctl) {
7916 		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
7917 	}
7918 
7919 	/* Only update retval if no one has handled the ioctl */
7920 	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7921 		if (result == ENOTSUP) {
7922 			result = EOPNOTSUPP;
7923 		}
7924 		retval = result;
7925 		if (retval && retval != EOPNOTSUPP) {
7926 			goto cleanup;
7927 		}
7928 	}
7929 
7930 cleanup:
7931 	if (retval == EJUSTRETURN) {
7932 		retval = 0;
7933 	}
7934 
7935 	ifnet_decr_iorefcnt(ifp);
7936 
7937 	return retval;
7938 }
7939 
7940 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7941 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7942 {
7943 	errno_t error = 0;
7944 
7945 	if (ifp->if_set_bpf_tap) {
7946 		/* Get an io reference on the interface if it is attached */
7947 		if (!ifnet_is_attached(ifp, 1)) {
7948 			return ENXIO;
7949 		}
7950 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7951 		ifnet_decr_iorefcnt(ifp);
7952 	}
7953 	return error;
7954 }
7955 
7956 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7957 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7958     struct sockaddr *ll_addr, size_t ll_len)
7959 {
7960 	errno_t result = EOPNOTSUPP;
7961 	struct if_proto *proto;
7962 	const struct sockaddr *verify;
7963 	proto_media_resolve_multi resolvep;
7964 
7965 	if (!ifnet_is_attached(ifp, 1)) {
7966 		return result;
7967 	}
7968 
7969 	bzero(ll_addr, ll_len);
7970 
7971 	/* Call the protocol first; callee holds a proto refcnt upon success */
7972 	ifnet_lock_shared(ifp);
7973 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7974 	ifnet_lock_done(ifp);
7975 	if (proto != NULL) {
7976 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7977 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7978 		if (resolvep != NULL) {
7979 			result = resolvep(ifp, proto_addr,
7980 			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7981 		}
7982 		if_proto_free(proto);
7983 	}
7984 
7985 	/* Let the interface verify the multicast address */
7986 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7987 		if (result == 0) {
7988 			verify = ll_addr;
7989 		} else {
7990 			verify = proto_addr;
7991 		}
7992 		result = ifp->if_check_multi(ifp, verify);
7993 	}
7994 
7995 	ifnet_decr_iorefcnt(ifp);
7996 	return result;
7997 }
7998 
7999 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8000 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
8001     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8002     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8003 {
8004 	struct if_proto *proto;
8005 	errno_t result = 0;
8006 
8007 	if ((ifp->if_flags & IFF_NOARP) != 0) {
8008 		result = ENOTSUP;
8009 		goto done;
8010 	}
8011 
8012 	/* callee holds a proto refcnt upon success */
8013 	ifnet_lock_shared(ifp);
8014 	proto = find_attached_proto(ifp, target_proto->sa_family);
8015 	ifnet_lock_done(ifp);
8016 	if (proto == NULL) {
8017 		result = ENOTSUP;
8018 	} else {
8019 		proto_media_send_arp    arpp;
8020 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8021 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8022 		if (arpp == NULL) {
8023 			result = ENOTSUP;
8024 		} else {
8025 			switch (arpop) {
8026 			case ARPOP_REQUEST:
8027 				arpstat.txrequests++;
8028 				if (target_hw != NULL) {
8029 					arpstat.txurequests++;
8030 				}
8031 				break;
8032 			case ARPOP_REPLY:
8033 				arpstat.txreplies++;
8034 				break;
8035 			}
8036 			result = arpp(ifp, arpop, sender_hw, sender_proto,
8037 			    target_hw, target_proto);
8038 		}
8039 		if_proto_free(proto);
8040 	}
8041 done:
8042 	return result;
8043 }
8044 
/*
 * Thread-mark tokens are opaque pointers encoded as small byte offsets
 * from the address of net_thread_marks_base; the offset records which
 * mark bits a push call actually changed (see net_thread_marks_push()
 * and net_thread_marks_pop() below).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no bits were changed" (offset 0 from the base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8050 
/*
 * Set the requested mark bits on the current thread's uu_network_marks
 * and return a token encoding — as a byte offset from the base object —
 * exactly the bits that were newly set, so that a matching
 * net_thread_marks_pop() restores only those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Only bits not already set need to be undone by the pop */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	return (net_thread_marks_t)&base[pop];
}
8068 
/*
 * Clear the requested mark bits on the current thread's uu_network_marks
 * and return a token encoding — as a byte offset from the base object —
 * exactly the bits that were actually cleared, so that a matching
 * net_thread_unmarks_pop() re-sets only those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Only bits currently set need to be restored by the pop */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	return (net_thread_marks_t)&base[unpop];
}
8086 
/*
 * Undo a prior net_thread_marks_push(): recover the changed-bit set from
 * the token (its byte offset from the base object) and clear exactly
 * those bits on the current thread.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The offset must fit in 32 bits and those bits must be set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8102 
/*
 * Undo a prior net_thread_unmarks_push(): recover the cleared-bit set
 * from the token (its byte offset from the base object) and re-set
 * exactly those bits on the current thread.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The offset must fit in 32 bits and those bits must be clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
8118 
8119 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8120 net_thread_is_marked(u_int32_t check)
8121 {
8122 	if (check != 0) {
8123 		struct uthread *uth = current_uthread();
8124 		return uth->uu_network_marks & check;
8125 	} else {
8126 		return 0;
8127 	}
8128 }
8129 
8130 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8131 net_thread_is_unmarked(u_int32_t check)
8132 {
8133 	if (check != 0) {
8134 		struct uthread *uth = current_uthread();
8135 		return ~uth->uu_network_marks & check;
8136 	} else {
8137 		return 0;
8138 	}
8139 }
8140 
8141 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)8142 _is_announcement(const struct sockaddr_in * sender_sin,
8143     const struct sockaddr_in * target_sin)
8144 {
8145 	if (target_sin == NULL || sender_sin == NULL) {
8146 		return FALSE;
8147 	}
8148 
8149 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
8150 }
8151 
/*
 * Send an ARP packet for the given operation/addresses.
 *
 * Normally forwards to dlil_send_arp_internal() on the given interface.
 * Special case: an ARP request for an IPv4 link-local target (that is not
 * a gratuitous announcement) is replicated on every attached interface
 * marked IFEF_ARPLL, using that interface's own hardware and IPv4 source
 * addresses.  Returns 0 on success or an errno; when broadcasting to all
 * interfaces, the first non-ENOTSUP per-interface result is returned.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		/* local copy so SIN_ROUTER can be set without touching caller's data */
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		/* ENOTSUP until at least one interface accepts the ARP */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/*
				 * Hold the link-level address across the
				 * send, since the ifnet lock is dropped.
				 */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first meaningful (non-ENOTSUP) status */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8266 
8267 /*
8268  * Caller must hold ifnet head lock.
8269  */
8270 static int
ifnet_lookup(struct ifnet * ifp)8271 ifnet_lookup(struct ifnet *ifp)
8272 {
8273 	struct ifnet *_ifp;
8274 
8275 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8276 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8277 		if (_ifp == ifp) {
8278 			break;
8279 		}
8280 	}
8281 	return _ifp != NULL;
8282 }
8283 
8284 /*
8285  * Caller has to pass a non-zero refio argument to get a
8286  * IO reference count. This will prevent ifnet_detach from
8287  * being called when there are outstanding io reference counts.
8288  */
8289 int
ifnet_is_attached(struct ifnet * ifp,int refio)8290 ifnet_is_attached(struct ifnet *ifp, int refio)
8291 {
8292 	int ret;
8293 
8294 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8295 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
8296 		if (refio > 0) {
8297 			ifp->if_refio++;
8298 		}
8299 	}
8300 	lck_mtx_unlock(&ifp->if_ref_lock);
8301 
8302 	return ret;
8303 }
8304 
/*
 * Account for one more kernel thread being brought up for this interface.
 * Balanced by ifnet_decr_pending_thread_count().
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8312 
/*
 * Account for one interface thread having finished starting up; wake any
 * waiter sleeping on if_threads_pending once the count drops to zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8324 
8325 /*
8326  * Caller must ensure the interface is attached; the assumption is that
8327  * there is at least an outstanding IO reference count held already.
8328  * Most callers would call ifnet_is_{attached,data_ready}() instead.
8329  */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* per the contract above, a reference must already be outstanding */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8339 
/*
 * Drop one IO reference with if_ref_lock already held.  Used by both
 * ifnet_decr_iorefcnt() and the datamov end/resume paths so the drop and
 * any related bookkeeping happen under a single lock hold.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* every datamov reference is also an IO reference, so refio bounds datamov */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8360 
/*
 * Drop one IO reference; may wake a detach waiter (see the locked variant).
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8368 
8369 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8370 ifnet_datamov_begin(struct ifnet *ifp)
8371 {
8372 	boolean_t ret;
8373 
8374 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8375 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8376 		ifp->if_refio++;
8377 		ifp->if_datamov++;
8378 	}
8379 	lck_mtx_unlock(&ifp->if_ref_lock);
8380 
8381 	return ret;
8382 }
8383 
/*
 * Leave the data-movement path: drop the datamov reference taken by
 * ifnet_datamov_begin(), waking any drainer blocked in
 * ifnet_datamov_drain() when this was the last mover, then drop the
 * matching IO reference.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8401 
/*
 * Suspend data movement with if_ref_lock held: take an IO reference and
 * bump the suspend count; the first suspension clears IFRF_READY so new
 * ifnet_datamov_begin() calls fail.  Balanced by ifnet_datamov_resume().
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8412 
/*
 * Unconditionally suspend data movement (nestable).  Each call must be
 * balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8421 
8422 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8423 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8424 {
8425 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8426 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8427 	if (ifp->if_suspend > 0) {
8428 		lck_mtx_unlock(&ifp->if_ref_lock);
8429 		return FALSE;
8430 	}
8431 	ifnet_datamov_suspend_locked(ifp);
8432 	lck_mtx_unlock(&ifp->if_ref_lock);
8433 	return TRUE;
8434 }
8435 
/*
 * Block until every in-flight data-movement thread has called
 * ifnet_datamov_end(), then flush the interface's transmit queue(s).
 * Data movement must already be suspended (if_suspend > 0) so no new
 * movers can enter while we wait.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* msleep drops and reacquires if_ref_lock around each wait */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8463 
/*
 * Convenience wrapper: suspend data movement, then wait for all movers
 * to drain.  Balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8470 
/*
 * Undo one suspension.  When the last suspension is lifted, IFRF_READY is
 * restored so data movement may begin again; the IO reference taken at
 * suspend time is dropped in the same lock hold.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8484 
8485 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8486 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8487 {
8488 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8489 	ctrace_t *tr;
8490 	u_int32_t idx;
8491 	u_int16_t *cnt;
8492 
8493 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8494 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8495 		/* NOTREACHED */
8496 	}
8497 
8498 	if (refhold) {
8499 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8500 		tr = dl_if_dbg->dldbg_if_refhold;
8501 	} else {
8502 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8503 		tr = dl_if_dbg->dldbg_if_refrele;
8504 	}
8505 
8506 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8507 	ctrace_record(&tr[idx]);
8508 }
8509 
/*
 * Take a reference on the underlying dlil_ifnet.  Returns EINVAL for a
 * NULL interface, 0 otherwise.  Panics on refcount wraparound.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	/* optional debug tracing hook (see dlil_if_trace) */
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8532 
/*
 * Release a reference on the underlying dlil_ifnet.  When the last
 * reference is dropped while the ifnet is still embryonic (never
 * attached), the dlil_ifnet storage is released as well.  Returns EINVAL
 * for a NULL interface, 0 otherwise.  Panics on refcount underflow.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* dropping the last reference on a never-attached ifnet */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	/* release outside dl_if_lock to avoid freeing under the spin lock */
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8567 
/*
 * Core of protocol attach shared by the v1 and v2 KPI entry points.
 * Inserts the fully-initialized if_proto into the interface's protocol
 * hash and notifies the family module and kernel-event listeners.
 * Returns 0 on success, EEXIST if the family is already attached, ENXIO
 * if the interface is detaching, EINVAL for disallowed vmnet attaches,
 * or the family module's add_proto error.  On success, *proto_count (if
 * non-NULL) receives the number of protocols now attached.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* takes an IO reference on success; dropped at ioref_done */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		/* drop the refcnt taken by find_attached_proto */
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	/* append at the tail of the hash chain */
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8647 
/*
 * Post-attach housekeeping after a protocol has been successfully
 * attached: mark the interface up and, with Skywalk, plumb the
 * flowswitch nexus for IP protocols.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8671 
/*
 * Attach a v1 protocol to an interface (public KPI).  Validates the
 * arguments, confirms the interface is on the global list, builds an
 * if_proto from the v1 callback set, and delegates to
 * dlil_attach_protocol().  Returns 0, EINVAL, ENXIO, EEXIST, or the
 * family module's error; the if_proto is freed on any failure.
 * Mirrors ifnet_attach_protocol_v2() for the v2 callback set.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* hold the head lock so ifp cannot leave the list underneath us */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* marks the interface up, etc. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8733 
/*
 * Attach a v2 protocol to an interface (public KPI).  Identical flow to
 * ifnet_attach_protocol() but populates the v2 callback set, whose input
 * callback takes a packet chain without a separate header pointer.
 * Returns 0, EINVAL, ENXIO, EEXIST, or the family module's error; the
 * if_proto is freed on any failure.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* hold the head lock so ifp cannot leave the list underneath us */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* marks the interface up, etc. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8795 
/*
 * Detach a protocol from an interface (public KPI).  Removes the
 * if_proto from the hash, notifies the family module, and swaps the
 * protocol's callbacks for inert media stubs so any in-flight callers
 * fail with ENXIO instead of crashing.  Returns 0, EINVAL, or ENXIO if
 * the protocol is not attached.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Replace the live callbacks with stubs that return ENXIO, so
	 * outstanding references can no longer reach the detached client.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8861 
/*
 * Inert v1 input stub installed by ifnet_detach_protocol(); rejects
 * any packet that still reaches a detached protocol with ENXIO.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8869 
/*
 * Inert v2 input stub installed by ifnet_detach_protocol(); rejects
 * any packet that still reaches a detached protocol with ENXIO.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8877 
/*
 * Inert pre_output stub installed by ifnet_detach_protocol(); fails
 * output framing for a detached protocol with ENXIO.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8886 
/*
 * Inert event stub installed by ifnet_detach_protocol(); silently
 * discards interface events aimed at a detached protocol.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8893 
/*
 * Inert ioctl stub installed by ifnet_detach_protocol(); fails ioctls
 * aimed at a detached protocol with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8901 
/*
 * Inert multicast-resolution stub installed by ifnet_detach_protocol();
 * fails resolution for a detached protocol with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8909 
/*
 * Inert send_arp stub installed by ifnet_detach_protocol(); fails ARP
 * transmission for a detached protocol with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8918 
8919 extern int if_next_index(void);
8920 extern int tcp_ecn_outbound;
8921 
8922 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8923 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8924 {
8925 	uint32_t sflags = 0;
8926 	int err;
8927 
8928 	if (if_flowadv) {
8929 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8930 	}
8931 
8932 	if (if_delaybased_queue) {
8933 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8934 	}
8935 
8936 	if (ifp->if_output_sched_model ==
8937 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8938 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8939 	}
8940 	/* Inherit drop limit from the default queue */
8941 	if (ifp->if_snd != ifcq) {
8942 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8943 	}
8944 	/* Initialize transmit queue(s) */
8945 	err = ifclassq_setup(ifcq, ifp, sflags);
8946 	if (err != 0) {
8947 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8948 		    "err=%d", __func__, ifp, err);
8949 		/* NOTREACHED */
8950 	}
8951 }
8952 
8953 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8954 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8955 {
8956 #if SKYWALK
8957 	boolean_t netif_compat;
8958 	if_nexus_netif  nexus_netif;
8959 #endif /* SKYWALK */
8960 	struct ifnet *tmp_if;
8961 	struct ifaddr *ifa;
8962 	struct if_data_internal if_data_saved;
8963 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8964 	struct dlil_threading_info *dl_inp;
8965 	thread_continue_t thfunc = NULL;
8966 	int err;
8967 
8968 	if (ifp == NULL) {
8969 		return EINVAL;
8970 	}
8971 
8972 	/*
8973 	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8974 	 * prevent the interface from being configured while it is
8975 	 * embryonic, as ifnet_head_lock is dropped and reacquired
8976 	 * below prior to marking the ifnet with IFRF_ATTACHED.
8977 	 */
8978 	dlil_if_lock();
8979 	ifnet_head_lock_exclusive();
8980 	/* Verify we aren't already on the list */
8981 	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8982 		if (tmp_if == ifp) {
8983 			ifnet_head_done();
8984 			dlil_if_unlock();
8985 			return EEXIST;
8986 		}
8987 	}
8988 
8989 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8990 	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8991 		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8992 		    __func__, ifp);
8993 		/* NOTREACHED */
8994 	}
8995 	lck_mtx_unlock(&ifp->if_ref_lock);
8996 
8997 	ifnet_lock_exclusive(ifp);
8998 
8999 	/* Sanity check */
9000 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9001 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9002 	VERIFY(ifp->if_threads_pending == 0);
9003 
9004 	if (ll_addr != NULL) {
9005 		if (ifp->if_addrlen == 0) {
9006 			ifp->if_addrlen = ll_addr->sdl_alen;
9007 		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
9008 			ifnet_lock_done(ifp);
9009 			ifnet_head_done();
9010 			dlil_if_unlock();
9011 			return EINVAL;
9012 		}
9013 	}
9014 
9015 	/*
9016 	 * Allow interfaces without protocol families to attach
9017 	 * only if they have the necessary fields filled out.
9018 	 */
9019 	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
9020 		DLIL_PRINTF("%s: Attempt to attach interface without "
9021 		    "family module - %d\n", __func__, ifp->if_family);
9022 		ifnet_lock_done(ifp);
9023 		ifnet_head_done();
9024 		dlil_if_unlock();
9025 		return ENODEV;
9026 	}
9027 
9028 	/* Allocate protocol hash table */
9029 	VERIFY(ifp->if_proto_hash == NULL);
9030 	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
9031 	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9032 
9033 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9034 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9035 	TAILQ_INIT(&ifp->if_flt_head);
9036 	VERIFY(ifp->if_flt_busy == 0);
9037 	VERIFY(ifp->if_flt_waiters == 0);
9038 	VERIFY(ifp->if_flt_non_os_count == 0);
9039 	VERIFY(ifp->if_flt_no_tso_count == 0);
9040 	lck_mtx_unlock(&ifp->if_flt_lock);
9041 
9042 	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
9043 		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
9044 		LIST_INIT(&ifp->if_multiaddrs);
9045 	}
9046 
9047 	VERIFY(ifp->if_allhostsinm == NULL);
9048 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9049 	TAILQ_INIT(&ifp->if_addrhead);
9050 
9051 	if (ifp->if_index == 0) {
9052 		int idx = if_next_index();
9053 
9054 		/*
9055 		 * Since we exhausted the list of
9056 		 * if_index's, try to find an empty slot
9057 		 * in ifindex2ifnet.
9058 		 */
9059 		if (idx == -1 && if_index >= UINT16_MAX) {
9060 			for (int i = 1; i < if_index; i++) {
9061 				if (ifindex2ifnet[i] == NULL &&
9062 				    ifnet_addrs[i - 1] == NULL) {
9063 					idx = i;
9064 					break;
9065 				}
9066 			}
9067 		}
9068 		if (idx == -1) {
9069 			ifp->if_index = 0;
9070 			ifnet_lock_done(ifp);
9071 			ifnet_head_done();
9072 			dlil_if_unlock();
9073 			return ENOBUFS;
9074 		}
9075 		ifp->if_index = (uint16_t)idx;
9076 
9077 		/* the lladdr passed at attach time is the permanent address */
9078 		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
9079 		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
9080 			bcopy(CONST_LLADDR(ll_addr),
9081 			    dl_if->dl_if_permanent_ether,
9082 			    ETHER_ADDR_LEN);
9083 			dl_if->dl_if_permanent_ether_is_set = 1;
9084 		}
9085 	}
9086 	/* There should not be anything occupying this slot */
9087 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9088 
9089 	/* allocate (if needed) and initialize a link address */
9090 	ifa = dlil_alloc_lladdr(ifp, ll_addr);
9091 	if (ifa == NULL) {
9092 		ifnet_lock_done(ifp);
9093 		ifnet_head_done();
9094 		dlil_if_unlock();
9095 		return ENOBUFS;
9096 	}
9097 
9098 	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
9099 	ifnet_addrs[ifp->if_index - 1] = ifa;
9100 
9101 	/* make this address the first on the list */
9102 	IFA_LOCK(ifa);
9103 	/* hold a reference for ifnet_addrs[] */
9104 	IFA_ADDREF_LOCKED(ifa);
9105 	/* if_attach_link_ifa() holds a reference for ifa_link */
9106 	if_attach_link_ifa(ifp, ifa);
9107 	IFA_UNLOCK(ifa);
9108 
9109 	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
9110 	ifindex2ifnet[ifp->if_index] = ifp;
9111 
9112 	/* Hold a reference to the underlying dlil_ifnet */
9113 	ifnet_reference(ifp);
9114 
9115 	/* Clear stats (save and restore other fields that we care) */
9116 	if_data_saved = ifp->if_data;
9117 	bzero(&ifp->if_data, sizeof(ifp->if_data));
9118 	ifp->if_data.ifi_type = if_data_saved.ifi_type;
9119 	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
9120 	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
9121 	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
9122 	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
9123 	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
9124 	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
9125 	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
9126 	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
9127 	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
9128 	ifnet_touch_lastchange(ifp);
9129 
9130 	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
9131 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
9132 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
9133 
9134 	dlil_ifclassq_setup(ifp, ifp->if_snd);
9135 
9136 	/* Sanity checks on the input thread storage */
9137 	dl_inp = &dl_if->dl_if_inpstorage;
9138 	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
9139 	VERIFY(dl_inp->dlth_flags == 0);
9140 	VERIFY(dl_inp->dlth_wtot == 0);
9141 	VERIFY(dl_inp->dlth_ifp == NULL);
9142 	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
9143 	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
9144 	VERIFY(!dl_inp->dlth_affinity);
9145 	VERIFY(ifp->if_inp == NULL);
9146 	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
9147 	VERIFY(dl_inp->dlth_strategy == NULL);
9148 	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
9149 	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
9150 	VERIFY(dl_inp->dlth_affinity_tag == 0);
9151 
9152 #if IFNET_INPUT_SANITY_CHK
9153 	VERIFY(dl_inp->dlth_pkts_cnt == 0);
9154 #endif /* IFNET_INPUT_SANITY_CHK */
9155 
9156 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9157 	dlil_reset_rxpoll_params(ifp);
9158 	/*
9159 	 * A specific DLIL input thread is created per non-loopback interface.
9160 	 */
9161 	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
9162 		ifp->if_inp = dl_inp;
9163 		ifnet_incr_pending_thread_count(ifp);
9164 		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
9165 		if (err == ENODEV) {
9166 			VERIFY(thfunc == NULL);
9167 			ifnet_decr_pending_thread_count(ifp);
9168 		} else if (err != 0) {
9169 			panic_plain("%s: ifp=%p couldn't get an input thread; "
9170 			    "err=%d", __func__, ifp, err);
9171 			/* NOTREACHED */
9172 		}
9173 	}
9174 	/*
9175 	 * If the driver supports the new transmit model, calculate flow hash
9176 	 * and create a workloop starter thread to invoke the if_start callback
9177 	 * where the packets may be dequeued and transmitted.
9178 	 */
9179 	if (ifp->if_eflags & IFEF_TXSTART) {
9180 		thread_precedence_policy_data_t info;
9181 		__unused kern_return_t kret;
9182 
9183 		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
9184 		VERIFY(ifp->if_flowhash != 0);
9185 		VERIFY(ifp->if_start_thread == THREAD_NULL);
9186 
9187 		ifnet_set_start_cycle(ifp, NULL);
9188 		ifp->if_start_pacemaker_time = 0;
9189 		ifp->if_start_active = 0;
9190 		ifp->if_start_req = 0;
9191 		ifp->if_start_flags = 0;
9192 		VERIFY(ifp->if_start != NULL);
9193 		ifnet_incr_pending_thread_count(ifp);
9194 		if ((err = kernel_thread_start(ifnet_start_thread_func,
9195 		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
9196 			panic_plain("%s: "
9197 			    "ifp=%p couldn't get a start thread; "
9198 			    "err=%d", __func__, ifp, err);
9199 			/* NOTREACHED */
9200 		}
9201 		bzero(&info, sizeof(info));
9202 		info.importance = 1;
9203 		kret = thread_policy_set(ifp->if_start_thread,
9204 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9205 		    THREAD_PRECEDENCE_POLICY_COUNT);
9206 		ASSERT(kret == KERN_SUCCESS);
9207 	} else {
9208 		ifp->if_flowhash = 0;
9209 	}
9210 
9211 	/* Reset polling parameters */
9212 	ifnet_set_poll_cycle(ifp, NULL);
9213 	ifp->if_poll_update = 0;
9214 	ifp->if_poll_flags = 0;
9215 	ifp->if_poll_req = 0;
9216 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9217 
9218 	/*
9219 	 * If the driver supports the new receive model, create a poller
9220 	 * thread to invoke if_input_poll callback where the packets may
9221 	 * be dequeued from the driver and processed for reception.
9222 	 * if the interface is netif compat then the poller thread is
9223 	 * managed by netif.
9224 	 */
9225 	if (thfunc == dlil_rxpoll_input_thread_func) {
9226 		thread_precedence_policy_data_t info;
9227 		__unused kern_return_t kret;
9228 #if SKYWALK
9229 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9230 #endif /* SKYWALK */
9231 		VERIFY(ifp->if_input_poll != NULL);
9232 		VERIFY(ifp->if_input_ctl != NULL);
9233 		ifnet_incr_pending_thread_count(ifp);
9234 		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
9235 		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
9236 			panic_plain("%s: ifp=%p couldn't get a poll thread; "
9237 			    "err=%d", __func__, ifp, err);
9238 			/* NOTREACHED */
9239 		}
9240 		bzero(&info, sizeof(info));
9241 		info.importance = 1;
9242 		kret = thread_policy_set(ifp->if_poll_thread,
9243 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9244 		    THREAD_PRECEDENCE_POLICY_COUNT);
9245 		ASSERT(kret == KERN_SUCCESS);
9246 	}
9247 
9248 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9249 	VERIFY(ifp->if_desc.ifd_len == 0);
9250 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9251 
9252 	/* Record attach PC stacktrace */
9253 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
9254 
9255 	ifp->if_updatemcasts = 0;
9256 	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
9257 		struct ifmultiaddr *ifma;
9258 		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
9259 			IFMA_LOCK(ifma);
9260 			if (ifma->ifma_addr->sa_family == AF_LINK ||
9261 			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
9262 				ifp->if_updatemcasts++;
9263 			}
9264 			IFMA_UNLOCK(ifma);
9265 		}
9266 
9267 		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
9268 		    "membership(s)\n", if_name(ifp),
9269 		    ifp->if_updatemcasts);
9270 	}
9271 
9272 	/* Clear logging parameters */
9273 	bzero(&ifp->if_log, sizeof(ifp->if_log));
9274 
9275 	/* Clear foreground/realtime activity timestamps */
9276 	ifp->if_fg_sendts = 0;
9277 	ifp->if_rt_sendts = 0;
9278 
9279 	/* Clear throughput estimates and radio type */
9280 	ifp->if_estimated_up_bucket = 0;
9281 	ifp->if_estimated_down_bucket = 0;
9282 	ifp->if_radio_type = 0;
9283 	ifp->if_radio_channel = 0;
9284 
9285 	VERIFY(ifp->if_delegated.ifp == NULL);
9286 	VERIFY(ifp->if_delegated.type == 0);
9287 	VERIFY(ifp->if_delegated.family == 0);
9288 	VERIFY(ifp->if_delegated.subfamily == 0);
9289 	VERIFY(ifp->if_delegated.expensive == 0);
9290 	VERIFY(ifp->if_delegated.constrained == 0);
9291 
9292 	VERIFY(ifp->if_agentids == NULL);
9293 	VERIFY(ifp->if_agentcount == 0);
9294 
9295 	/* Reset interface state */
9296 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9297 	ifp->if_interface_state.valid_bitmask |=
9298 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
9299 	ifp->if_interface_state.interface_availability =
9300 	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
9301 
9302 	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
9303 	if (ifp == lo_ifp) {
9304 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
9305 		ifp->if_interface_state.valid_bitmask |=
9306 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
9307 	} else {
9308 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
9309 	}
9310 
9311 	/*
9312 	 * Enable ECN capability on this interface depending on the
9313 	 * value of ECN global setting
9314 	 */
9315 	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
9316 		if_set_eflags(ifp, IFEF_ECN_ENABLE);
9317 		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
9318 	}
9319 
9320 	/*
9321 	 * Built-in Cyclops always on policy for WiFi infra
9322 	 */
9323 	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9324 		errno_t error;
9325 
9326 		error = if_set_qosmarking_mode(ifp,
9327 		    IFRTYPE_QOSMARKING_FASTLANE);
9328 		if (error != 0) {
9329 			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9330 			    __func__, ifp->if_xname, error);
9331 		} else {
9332 			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9333 #if (DEVELOPMENT || DEBUG)
9334 			DLIL_PRINTF("%s fastlane enabled on %s\n",
9335 			    __func__, ifp->if_xname);
9336 #endif /* (DEVELOPMENT || DEBUG) */
9337 		}
9338 	}
9339 
9340 	ifnet_lock_done(ifp);
9341 	ifnet_head_done();
9342 
9343 #if SKYWALK
9344 	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9345 #endif /* SKYWALK */
9346 
9347 	lck_mtx_lock(&ifp->if_cached_route_lock);
9348 	/* Enable forwarding cached route */
9349 	ifp->if_fwd_cacheok = 1;
9350 	/* Clean up any existing cached routes */
9351 	ROUTE_RELEASE(&ifp->if_fwd_route);
9352 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9353 	ROUTE_RELEASE(&ifp->if_src_route);
9354 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9355 	ROUTE_RELEASE(&ifp->if_src_route6);
9356 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9357 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9358 
9359 	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9360 
9361 	/*
9362 	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9363 	 * and trees; do this before the ifnet is marked as attached.
9364 	 * The ifnet keeps the reference to the info structures even after
9365 	 * the ifnet is detached, since the network-layer records still
9366 	 * refer to the info structures even after that.  This also
9367 	 * makes it possible for them to still function after the ifnet
9368 	 * is recycled or reattached.
9369 	 */
9370 #if INET
9371 	if (IGMP_IFINFO(ifp) == NULL) {
9372 		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9373 		VERIFY(IGMP_IFINFO(ifp) != NULL);
9374 	} else {
9375 		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9376 		igmp_domifreattach(IGMP_IFINFO(ifp));
9377 	}
9378 #endif /* INET */
9379 	if (MLD_IFINFO(ifp) == NULL) {
9380 		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9381 		VERIFY(MLD_IFINFO(ifp) != NULL);
9382 	} else {
9383 		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9384 		mld_domifreattach(MLD_IFINFO(ifp));
9385 	}
9386 
9387 	VERIFY(ifp->if_data_threshold == 0);
9388 	VERIFY(ifp->if_dt_tcall != NULL);
9389 
9390 	/*
9391 	 * Wait for the created kernel threads for I/O to get
9392 	 * scheduled and run at least once before we proceed
9393 	 * to mark interface as attached.
9394 	 */
9395 	lck_mtx_lock(&ifp->if_ref_lock);
9396 	while (ifp->if_threads_pending != 0) {
9397 		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9398 		    "interface %s to get scheduled at least once.\n",
9399 		    __func__, ifp->if_xname);
9400 		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9401 		    __func__, NULL);
9402 		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9403 	}
9404 	lck_mtx_unlock(&ifp->if_ref_lock);
9405 	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9406 	    "at least once. Proceeding.\n", __func__, ifp->if_xname);
9407 
9408 	/* Final mark this ifnet as attached. */
9409 	ifnet_lock_exclusive(ifp);
9410 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9411 	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9412 	lck_mtx_unlock(&ifp->if_ref_lock);
9413 	if (net_rtref) {
9414 		/* boot-args override; enable idle notification */
9415 		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9416 		    IFRF_IDLE_NOTIFY);
9417 	} else {
9418 		/* apply previous request(s) to set the idle flags, if any */
9419 		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9420 		    ifp->if_idle_new_flags_mask);
9421 	}
9422 #if SKYWALK
9423 	/* the interface is fully attached; let the nexus adapter know */
9424 	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9425 		if (netif_compat) {
9426 			if (sk_netif_compat_txmodel ==
9427 			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9428 				ifnet_enqueue_multi_setup(ifp,
9429 				    sk_tx_delay_qlen, sk_tx_delay_timeout);
9430 			}
9431 			ifp->if_nx_netif = nexus_netif;
9432 		}
9433 		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9434 	}
9435 #endif /* SKYWALK */
9436 	ifnet_lock_done(ifp);
9437 	dlil_if_unlock();
9438 
9439 #if PF
9440 	/*
9441 	 * Attach packet filter to this interface, if enabled.
9442 	 */
9443 	pf_ifnet_hook(ifp, 1);
9444 #endif /* PF */
9445 
9446 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9447 
9448 	if (dlil_verbose) {
9449 		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9450 		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9451 	}
9452 
9453 	return 0;
9454 }
9455 
9456 /*
9457  * Prepare the storage for the first/permanent link address, which must
9458  * must have the same lifetime as the ifnet itself.  Although the link
9459  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9460  * its location in memory must never change as it may still be referred
9461  * to by some parts of the system afterwards (unfortunate implementation
9462  * artifacts inherited from BSD.)
9463  *
9464  * Caller must hold ifnet lock as writer.
9465  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size a sockaddr_dl that carries both the interface name (in
	 * sdl_data) and the link-layer address right behind it.  The
	 * mask only needs to cover up to the end of the name portion.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
/* round up to a multiple of sizeof (u_int32_t) */
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: lives as long as the ifnet */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* populate the address sockaddr_dl (name, index, type, lladdr) */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		/* name copy is capped at the sdl_data capacity */
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* netmask: all-ones over the name portion of the address */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the ifnet's reference to the previous link address, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9574 
/*
 * Ask the network layers to remove all of their addresses configured
 * on this interface: IPv4 (when built in) and IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9583 
/*
 * First phase of interface detach: mark the interface as detaching,
 * unlink it from ifnet_head and ifindex2ifnet[] so lookups no longer
 * find it, reset per-interface state, and hand the remaining teardown
 * (ifnet_detach_final) to the detacher worker thread.
 *
 * Returns 0 on success, EINVAL if ifp is NULL or was never attached,
 * or ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* mark the ND6 CGA state for this interface as uninitialized */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* transition IFRF_ATTACHED -> IFRF_DETACHING under if_ref_lock */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connection after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9778 
9779 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9780 ifnet_detaching_enqueue(struct ifnet *ifp)
9781 {
9782 	dlil_if_lock_assert();
9783 
9784 	++ifnet_detaching_cnt;
9785 	VERIFY(ifnet_detaching_cnt != 0);
9786 	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9787 	wakeup((caddr_t)&ifnet_delayed_run);
9788 }
9789 
9790 static struct ifnet *
ifnet_detaching_dequeue(void)9791 ifnet_detaching_dequeue(void)
9792 {
9793 	struct ifnet *ifp;
9794 
9795 	dlil_if_lock_assert();
9796 
9797 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9798 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9799 	if (ifp != NULL) {
9800 		VERIFY(ifnet_detaching_cnt != 0);
9801 		--ifnet_detaching_cnt;
9802 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9803 		ifp->if_detaching_link.tqe_next = NULL;
9804 		ifp->if_detaching_link.tqe_prev = NULL;
9805 	}
9806 	return ifp;
9807 }
9808 
9809 __attribute__((noreturn))
9810 static void
ifnet_detacher_thread_cont(void * v,wait_result_t wres)9811 ifnet_detacher_thread_cont(void *v, wait_result_t wres)
9812 {
9813 #pragma unused(v, wres)
9814 	struct ifnet *ifp;
9815 
9816 	dlil_if_lock();
9817 	if (__improbable(ifnet_detaching_embryonic)) {
9818 		ifnet_detaching_embryonic = FALSE;
9819 		/* there's no lock ordering constrain so OK to do this here */
9820 		dlil_decr_pending_thread_count();
9821 	}
9822 
9823 	for (;;) {
9824 		dlil_if_lock_assert();
9825 
9826 		if (ifnet_detaching_cnt == 0) {
9827 			break;
9828 		}
9829 
9830 		net_update_uptime();
9831 
9832 		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
9833 
9834 		/* Take care of detaching ifnet */
9835 		ifp = ifnet_detaching_dequeue();
9836 		if (ifp != NULL) {
9837 			dlil_if_unlock();
9838 			ifnet_detach_final(ifp);
9839 			dlil_if_lock();
9840 		}
9841 	}
9842 
9843 	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
9844 	dlil_if_unlock();
9845 	(void) thread_block(ifnet_detacher_thread_cont);
9846 
9847 	VERIFY(0);      /* we should never get here */
9848 	/* NOTREACHED */
9849 	__builtin_unreachable();
9850 }
9851 
/*
 * Bootstrap entry point for the detacher thread.  Arms the wait on
 * ifnet_delayed_run, marks the thread embryonic so the continuation
 * can signal the spawner, self-wakes once, and blocks into the
 * continuation which then runs forever.  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9868 
9869 static void
ifnet_detach_final(struct ifnet * ifp)9870 ifnet_detach_final(struct ifnet *ifp)
9871 {
9872 	struct ifnet_filter *filter, *filter_next;
9873 	struct dlil_ifnet *dlifp;
9874 	struct ifnet_filter_head fhead;
9875 	struct dlil_threading_info *inp;
9876 	struct ifaddr *ifa;
9877 	ifnet_detached_func if_free;
9878 	int i;
9879 
9880 	/* Let BPF know we're detaching */
9881 	bpfdetach(ifp);
9882 
9883 #if SKYWALK
9884 	dlil_netif_detach_notify(ifp);
9885 	/*
9886 	 * Wait for the datapath to quiesce before tearing down
9887 	 * netif/flowswitch nexuses.
9888 	 */
9889 	dlil_quiesce_and_detach_nexuses(ifp);
9890 #endif /* SKYWALK */
9891 
9892 	lck_mtx_lock(&ifp->if_ref_lock);
9893 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9894 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9895 		    __func__, ifp);
9896 		/* NOTREACHED */
9897 	}
9898 
9899 	/*
9900 	 * Wait until the existing IO references get released
9901 	 * before we proceed with ifnet_detach.  This is not a
9902 	 * common case, so block without using a continuation.
9903 	 */
9904 	while (ifp->if_refio > 0) {
9905 		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9906 		    "to be released\n", __func__, if_name(ifp));
9907 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9908 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9909 	}
9910 
9911 	VERIFY(ifp->if_datamov == 0);
9912 	VERIFY(ifp->if_drainers == 0);
9913 	VERIFY(ifp->if_suspend == 0);
9914 	ifp->if_refflags &= ~IFRF_READY;
9915 	lck_mtx_unlock(&ifp->if_ref_lock);
9916 
9917 	/* Clear agent IDs */
9918 	if (ifp->if_agentids != NULL) {
9919 		kfree_data(ifp->if_agentids,
9920 		    sizeof(uuid_t) * ifp->if_agentcount);
9921 		ifp->if_agentids = NULL;
9922 	}
9923 	ifp->if_agentcount = 0;
9924 
9925 #if SKYWALK
9926 	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9927 #endif /* SKYWALK */
9928 	/* Drain and destroy send queue */
9929 	ifclassq_teardown(ifp->if_snd);
9930 
9931 	/* Detach interface filters */
9932 	lck_mtx_lock(&ifp->if_flt_lock);
9933 	if_flt_monitor_enter(ifp);
9934 
9935 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9936 	fhead = ifp->if_flt_head;
9937 	TAILQ_INIT(&ifp->if_flt_head);
9938 
9939 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9940 		filter_next = TAILQ_NEXT(filter, filt_next);
9941 		lck_mtx_unlock(&ifp->if_flt_lock);
9942 
9943 		dlil_detach_filter_internal(filter, 1);
9944 		lck_mtx_lock(&ifp->if_flt_lock);
9945 	}
9946 	if_flt_monitor_leave(ifp);
9947 	lck_mtx_unlock(&ifp->if_flt_lock);
9948 
9949 	/* Tell upper layers to drop their network addresses */
9950 	if_purgeaddrs(ifp);
9951 
9952 	ifnet_lock_exclusive(ifp);
9953 
9954 	/* Unplumb all protocols */
9955 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9956 		struct if_proto *proto;
9957 
9958 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9959 		while (proto != NULL) {
9960 			protocol_family_t family = proto->protocol_family;
9961 			ifnet_lock_done(ifp);
9962 			proto_unplumb(family, ifp);
9963 			ifnet_lock_exclusive(ifp);
9964 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9965 		}
9966 		/* There should not be any protocols left */
9967 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9968 	}
9969 	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9970 	ifp->if_proto_hash = NULL;
9971 
9972 	/* Detach (permanent) link address from if_addrhead */
9973 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9974 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9975 	IFA_LOCK(ifa);
9976 	if_detach_link_ifa(ifp, ifa);
9977 	IFA_UNLOCK(ifa);
9978 
9979 	/* Remove (permanent) link address from ifnet_addrs[] */
9980 	IFA_REMREF(ifa);
9981 	ifnet_addrs[ifp->if_index - 1] = NULL;
9982 
9983 	/* This interface should not be on {ifnet_head,detaching} */
9984 	VERIFY(ifp->if_link.tqe_next == NULL);
9985 	VERIFY(ifp->if_link.tqe_prev == NULL);
9986 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9987 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9988 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9989 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9990 
9991 	/* The slot should have been emptied */
9992 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9993 
9994 	/* There should not be any addresses left */
9995 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9996 
9997 	/*
9998 	 * Signal the starter thread to terminate itself, and wait until
9999 	 * it has exited.
10000 	 */
10001 	if (ifp->if_start_thread != THREAD_NULL) {
10002 		lck_mtx_lock_spin(&ifp->if_start_lock);
10003 		ifp->if_start_flags |= IFSF_TERMINATING;
10004 		wakeup_one((caddr_t)&ifp->if_start_thread);
10005 		lck_mtx_unlock(&ifp->if_start_lock);
10006 
10007 		/* wait for starter thread to terminate */
10008 		lck_mtx_lock(&ifp->if_start_lock);
10009 		while (ifp->if_start_thread != THREAD_NULL) {
10010 			if (dlil_verbose) {
10011 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10012 				    __func__,
10013 				    if_name(ifp));
10014 			}
10015 			(void) msleep(&ifp->if_start_thread,
10016 			    &ifp->if_start_lock, (PZERO - 1),
10017 			    "ifnet_start_thread_exit", NULL);
10018 		}
10019 		lck_mtx_unlock(&ifp->if_start_lock);
10020 		if (dlil_verbose) {
10021 			DLIL_PRINTF("%s: %s starter thread termination complete",
10022 			    __func__, if_name(ifp));
10023 		}
10024 	}
10025 
10026 	/*
10027 	 * Signal the poller thread to terminate itself, and wait until
10028 	 * it has exited.
10029 	 */
10030 	if (ifp->if_poll_thread != THREAD_NULL) {
10031 #if SKYWALK
10032 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10033 #endif /* SKYWALK */
10034 		lck_mtx_lock_spin(&ifp->if_poll_lock);
10035 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10036 		wakeup_one((caddr_t)&ifp->if_poll_thread);
10037 		lck_mtx_unlock(&ifp->if_poll_lock);
10038 
10039 		/* wait for poller thread to terminate */
10040 		lck_mtx_lock(&ifp->if_poll_lock);
10041 		while (ifp->if_poll_thread != THREAD_NULL) {
10042 			if (dlil_verbose) {
10043 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10044 				    __func__,
10045 				    if_name(ifp));
10046 			}
10047 			(void) msleep(&ifp->if_poll_thread,
10048 			    &ifp->if_poll_lock, (PZERO - 1),
10049 			    "ifnet_poll_thread_exit", NULL);
10050 		}
10051 		lck_mtx_unlock(&ifp->if_poll_lock);
10052 		if (dlil_verbose) {
10053 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
10054 			    __func__, if_name(ifp));
10055 		}
10056 	}
10057 
10058 	/*
10059 	 * If thread affinity was set for the workloop thread, we will need
10060 	 * to tear down the affinity and release the extra reference count
10061 	 * taken at attach time.  Does not apply to lo0 or other interfaces
10062 	 * without dedicated input threads.
10063 	 */
10064 	if ((inp = ifp->if_inp) != NULL) {
10065 		VERIFY(inp != dlil_main_input_thread);
10066 
10067 		if (inp->dlth_affinity) {
10068 			struct thread *tp, *wtp, *ptp;
10069 
10070 			lck_mtx_lock_spin(&inp->dlth_lock);
10071 			wtp = inp->dlth_driver_thread;
10072 			inp->dlth_driver_thread = THREAD_NULL;
10073 			ptp = inp->dlth_poller_thread;
10074 			inp->dlth_poller_thread = THREAD_NULL;
10075 			ASSERT(inp->dlth_thread != THREAD_NULL);
10076 			tp = inp->dlth_thread;    /* don't nullify now */
10077 			inp->dlth_affinity_tag = 0;
10078 			inp->dlth_affinity = FALSE;
10079 			lck_mtx_unlock(&inp->dlth_lock);
10080 
10081 			/* Tear down poll thread affinity */
10082 			if (ptp != NULL) {
10083 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10084 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
10085 				(void) dlil_affinity_set(ptp,
10086 				    THREAD_AFFINITY_TAG_NULL);
10087 				thread_deallocate(ptp);
10088 			}
10089 
10090 			/* Tear down workloop thread affinity */
10091 			if (wtp != NULL) {
10092 				(void) dlil_affinity_set(wtp,
10093 				    THREAD_AFFINITY_TAG_NULL);
10094 				thread_deallocate(wtp);
10095 			}
10096 
10097 			/* Tear down DLIL input thread affinity */
10098 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10099 			thread_deallocate(tp);
10100 		}
10101 
10102 		/* disassociate ifp DLIL input thread */
10103 		ifp->if_inp = NULL;
10104 
10105 		/* if the worker thread was created, tell it to terminate */
10106 		if (inp->dlth_thread != THREAD_NULL) {
10107 			lck_mtx_lock_spin(&inp->dlth_lock);
10108 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10109 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10110 				wakeup_one((caddr_t)&inp->dlth_flags);
10111 			}
10112 			lck_mtx_unlock(&inp->dlth_lock);
10113 			ifnet_lock_done(ifp);
10114 
10115 			/* wait for the input thread to terminate */
10116 			lck_mtx_lock_spin(&inp->dlth_lock);
10117 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10118 			    == 0) {
10119 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
10120 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
10121 			}
10122 			lck_mtx_unlock(&inp->dlth_lock);
10123 			ifnet_lock_exclusive(ifp);
10124 		}
10125 
10126 		/* clean-up input thread state */
10127 		dlil_clean_threading_info(inp);
10128 		/* clean-up poll parameters */
10129 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
10130 		dlil_reset_rxpoll_params(ifp);
10131 	}
10132 
10133 	/* The driver might unload, so point these to ourselves */
10134 	if_free = ifp->if_free;
10135 	ifp->if_output_dlil = ifp_if_output;
10136 	ifp->if_output = ifp_if_output;
10137 	ifp->if_pre_enqueue = ifp_if_output;
10138 	ifp->if_start = ifp_if_start;
10139 	ifp->if_output_ctl = ifp_if_ctl;
10140 	ifp->if_input_dlil = ifp_if_input;
10141 	ifp->if_input_poll = ifp_if_input_poll;
10142 	ifp->if_input_ctl = ifp_if_ctl;
10143 	ifp->if_ioctl = ifp_if_ioctl;
10144 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10145 	ifp->if_free = ifp_if_free;
10146 	ifp->if_demux = ifp_if_demux;
10147 	ifp->if_event = ifp_if_event;
10148 	ifp->if_framer_legacy = ifp_if_framer;
10149 	ifp->if_framer = ifp_if_framer_extended;
10150 	ifp->if_add_proto = ifp_if_add_proto;
10151 	ifp->if_del_proto = ifp_if_del_proto;
10152 	ifp->if_check_multi = ifp_if_check_multi;
10153 
10154 	/* wipe out interface description */
10155 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10156 	ifp->if_desc.ifd_len = 0;
10157 	VERIFY(ifp->if_desc.ifd_desc != NULL);
10158 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
10159 
10160 	/* there shouldn't be any delegation by now */
10161 	VERIFY(ifp->if_delegated.ifp == NULL);
10162 	VERIFY(ifp->if_delegated.type == 0);
10163 	VERIFY(ifp->if_delegated.family == 0);
10164 	VERIFY(ifp->if_delegated.subfamily == 0);
10165 	VERIFY(ifp->if_delegated.expensive == 0);
10166 	VERIFY(ifp->if_delegated.constrained == 0);
10167 
10168 	/* QoS marking get cleared */
10169 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10170 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10171 
10172 #if SKYWALK
10173 	/* the nexus destructor is responsible for clearing these */
10174 	VERIFY(ifp->if_na_ops == NULL);
10175 	VERIFY(ifp->if_na == NULL);
10176 #endif /* SKYWALK */
10177 
10178 	/* promiscuous/allmulti counts need to start at zero again */
10179 	ifp->if_pcount = 0;
10180 	ifp->if_amcount = 0;
10181 	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10182 
10183 	ifnet_lock_done(ifp);
10184 
10185 #if PF
10186 	/*
10187 	 * Detach this interface from packet filter, if enabled.
10188 	 */
10189 	pf_ifnet_hook(ifp, 0);
10190 #endif /* PF */
10191 
10192 	/* Filter list should be empty */
10193 	lck_mtx_lock_spin(&ifp->if_flt_lock);
10194 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10195 	VERIFY(ifp->if_flt_busy == 0);
10196 	VERIFY(ifp->if_flt_waiters == 0);
10197 	VERIFY(ifp->if_flt_non_os_count == 0);
10198 	VERIFY(ifp->if_flt_no_tso_count == 0);
10199 	lck_mtx_unlock(&ifp->if_flt_lock);
10200 
10201 	/* Last chance to drain send queue */
10202 	if_qflush_snd(ifp, 0);
10203 
10204 	/* Last chance to cleanup any cached route */
10205 	lck_mtx_lock(&ifp->if_cached_route_lock);
10206 	VERIFY(!ifp->if_fwd_cacheok);
10207 	ROUTE_RELEASE(&ifp->if_fwd_route);
10208 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
10209 	ROUTE_RELEASE(&ifp->if_src_route);
10210 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
10211 	ROUTE_RELEASE(&ifp->if_src_route6);
10212 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
10213 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10214 
10215 	VERIFY(ifp->if_data_threshold == 0);
10216 	VERIFY(ifp->if_dt_tcall != NULL);
10217 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10218 
10219 	ifnet_llreach_ifdetach(ifp);
10220 
10221 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
10222 
10223 	/*
10224 	 * Finally, mark this ifnet as detached.
10225 	 */
10226 	if (dlil_verbose) {
10227 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
10228 	}
10229 	lck_mtx_lock_spin(&ifp->if_ref_lock);
10230 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
10231 		panic("%s: flags mismatch (detaching not set) ifp=%p",
10232 		    __func__, ifp);
10233 		/* NOTREACHED */
10234 	}
10235 	ifp->if_refflags &= ~IFRF_DETACHING;
10236 	lck_mtx_unlock(&ifp->if_ref_lock);
10237 	if (if_free != NULL) {
10238 		if_free(ifp);
10239 	}
10240 
10241 	ifclassq_release(&ifp->if_snd);
10242 
10243 	/* we're fully detached, clear the "in use" bit */
10244 	dlifp = (struct dlil_ifnet *)ifp;
10245 	lck_mtx_lock(&dlifp->dl_if_lock);
10246 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10247 	dlifp->dl_if_flags &= ~DLIF_INUSE;
10248 	lck_mtx_unlock(&dlifp->dl_if_lock);
10249 
10250 	/* Release reference held during ifnet attach */
10251 	ifnet_release(ifp);
10252 }
10253 
10254 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)10255 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
10256 {
10257 #pragma unused(ifp)
10258 	m_freem_list(m);
10259 	return 0;
10260 }
10261 
/*
 * if_start handler installed on a detached ifnet: instead of starting
 * transmission, discard whatever is still queued on the interface.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
10267 
10268 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)10269 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
10270     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
10271     boolean_t poll, struct thread *tp)
10272 {
10273 #pragma unused(ifp, m_tail, s, poll, tp)
10274 	m_freem_list(m_head);
10275 	return ENXIO;
10276 }
10277 
10278 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)10279 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
10280     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
10281 {
10282 #pragma unused(ifp, flags, max_cnt)
10283 	if (m_head != NULL) {
10284 		*m_head = NULL;
10285 	}
10286 	if (m_tail != NULL) {
10287 		*m_tail = NULL;
10288 	}
10289 	if (cnt != NULL) {
10290 		*cnt = 0;
10291 	}
10292 	if (len != NULL) {
10293 		*len = 0;
10294 	}
10295 }
10296 
/*
 * Control handler stub shared by the detached-ifnet output and input
 * control hooks; no control commands are supported once the driver
 * has been torn down.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
10303 
10304 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)10305 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
10306 {
10307 #pragma unused(ifp, fh, pf)
10308 	m_freem(m);
10309 	return EJUSTRETURN;
10310 }
10311 
/*
 * Protocol attach stub installed on a detached ifnet; attaching a
 * protocol to a dead interface is always rejected.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
10319 
/*
 * Protocol detach stub installed on a detached ifnet; there are no
 * protocols left to remove, so the request is rejected.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
10326 
/*
 * Multicast-address check stub installed on a detached ifnet;
 * multicast membership is not supported on a dead interface.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10333 
/*
 * Legacy framer stub installed on a detached ifnet.  It simply
 * forwards to ifp_if_framer_extended(), which drops the packet.
 * The legacy signature differs by platform: on macOS it lacks the
 * pre/post length outputs, so NULL is passed for both there.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10352 
10353 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10354 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10355     const struct sockaddr *sa, const char *ll, const char *t,
10356     u_int32_t *pre, u_int32_t *post)
10357 {
10358 #pragma unused(ifp, sa, ll, t)
10359 	m_freem(*m);
10360 	*m = NULL;
10361 
10362 	if (pre != NULL) {
10363 		*pre = 0;
10364 	}
10365 	if (post != NULL) {
10366 		*post = 0;
10367 	}
10368 
10369 	return EJUSTRETURN;
10370 }
10371 
/*
 * ioctl stub installed on a detached ifnet; every command is refused
 * since the driver that implemented them has been torn down.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10378 
/*
 * BPF tap stub installed on a detached ifnet.  There is no traffic
 * left to tap, so the request is silently accepted as a no-op.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10386 
/*
 * if_free stub installed on a detached ifnet: the original driver's
 * free routine was already invoked during detach, so this is a no-op
 * placeholder in case the pointer is called again.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10392 
/*
 * Event handler stub installed on a detached ifnet; kernel events
 * delivered after detach are simply ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10398 
/*
 * Acquire a dlil_ifnet for the given interface family, either by
 * recycling a previously-detached one that carries the same unique id,
 * or by allocating a fresh one from dlif_zone.
 *
 * Parameters:
 *   family       - interface family to match against existing entries
 *   uniqueid     - opaque driver-supplied identity blob (may be unused
 *                  when uniqueid_len is 0)
 *   uniqueid_len - length of uniqueid in bytes; 0 disables id matching
 *   ifxname      - requested extended name, used for collision checks
 *   ifp          - out: on success, points at the acquired ifnet with
 *                  a dlil reference held (dlil_if_ref)
 *
 * Returns 0 on success, EBUSY if an in-use interface already owns the
 * requested name or unique id, or ENOMEM if the unique-id copy fails.
 *
 * The global dlil_if_lock is held across the whole list traversal so
 * the name/id collision check and the in-use claim are atomic with
 * respect to other acquirers.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		/* Same unique id while in use is also a conflict. */
		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		/* Claim it and mark it as a recycled instance. */
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	/* Stash a private copy of the caller's unique id, if any. */
	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* Name storage lives inside the dlil_ifnet itself. */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* Initialize all per-ifnet locks before publishing on the list. */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* Sanity-check the alignment guaranteed by the P2ROUNDUP above. */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10577 
/*
 * Common release path for a dlil_ifnet: decrement the global ifnet
 * allocation statistics, free any oversized broadcast-address buffer,
 * and reset the interface's name storage back to the embedded storage
 * inside the dlil_ifnet (the external name gets a "<name>?" suffix to
 * mark it as released).  When clear_in_use is true, the DLIF_INUSE bit
 * is also cleared so the entry can be recycled by dlil_if_acquire().
 *
 * Locking: takes the ifnet lock exclusively, then dl_if_lock nested
 * inside it.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* Broadcast addresses larger than the inline buffer were heap-allocated. */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10608 
/*
 * Public release wrapper: releases the interface's name/broadcast
 * state but leaves DLIF_INUSE set (the in-use bit is cleared elsewhere
 * once detach fully completes).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10614 
/* Acquire the global lock protecting dlil_ifnet_head. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10620 
/* Release the global lock protecting dlil_ifnet_head. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10626 
/* Assert that the current thread owns the global dlil ifnet lock. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10632 
/*
 * Detach the well-known protocol families (IPv4 and IPv6) from an
 * interface during teardown; other protocols are expected to have
 * unplumbed themselves already (see comment below).
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10648 
/*
 * Copy the interface's cached IPv4 source route into *dst under
 * if_cached_route_lock.  The lock is taken as a spin lock and then
 * converted to a full mutex since route_copyout may take a while.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10659 
/*
 * Store *src back into the interface's cached IPv4 source route,
 * consuming the caller's route reference.  If route caching has been
 * disabled on the interface (if_fwd_cacheok cleared, e.g. during
 * detach), the route is released instead of being cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10673 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the interface's
 * cached IPv6 source route into *dst under if_cached_route_lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10685 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): store *src back into
 * the interface's cached IPv6 source route, or release it if route
 * caching is disabled on the interface.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10700 
/*
 * Look up (and cache) the route for an IPv4 source address on the
 * given interface.  If the cached per-ifnet source route is still
 * usable and matches src_ip, it is returned directly; otherwise a
 * fresh scoped route lookup is performed and the result is stored
 * back into the interface cache.
 *
 * Returns the rtentry (or NULL if no route was found).  The returned
 * route carries a reference the caller is responsible for releasing —
 * note the extra RT_ADDREF taken because copyin consumes one.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	/* Cache miss: route unusable or cached for a different address. */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (Re)initialize the destination sockaddr if needed. */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10735 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up (and
 * cache) the route for an IPv6 source address on the interface.
 * Returns the rtentry (or NULL); the returned route carries a
 * reference the caller must release — an extra RT_ADDREF is taken
 * because copyin consumes one.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	/* Cache miss: route unusable or cached for a different address. */
	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		/* (Re)initialize the destination sockaddr if needed. */
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ROUTE_RELEASE above cleared ro_rt, so this performs the lookup. */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10772 
/*
 * Update the interface's link-quality metric (LQM) state and, if it
 * changed, post a KEV_DL_LINK_QUALITY_METRIC_CHANGED kernel event.
 *
 * Parameters:
 *   ifp    - interface to update
 *   lqm    - raw metric in [IFNET_LQM_MIN, IFNET_LQM_MAX]; normalized
 *            to one of the threshold edge values before storing
 *   locked - nonzero when the caller already holds the ifnet lock
 *            exclusively; the lock is dropped around the event post
 *            and reacquired for the caller before returning
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* Quality at/below abort: kick TCP into aborting bad connections. */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10836 
/*
 * Update the interface's cellular RRC (radio resource control) state
 * and post KEV_DL_RRC_STATE_CHANGED if it changed.
 *
 * Caller must hold the ifnet lock exclusively; the lock is dropped
 * while the kernel event is posted and reacquired before returning
 * (so the caller's lock state is preserved across the call).
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* No change and already valid: nothing to do. */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10866 
/*
 * Apply a caller-supplied interface state update (LQM, RRC state,
 * and/or availability) to an interface, validating each field that
 * the valid_bitmask declares present.
 *
 * Returns ENOTSUP when RRC state is supplied for a non-cellular
 * interface, EINVAL for out-of-range LQM or RRC values, 0 otherwise.
 *
 * When the update marks the interface available, TCP is asked to send
 * probes on it immediately (after all locks are dropped).
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense on cellular interfaces. */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/*
	 * Both helpers below may temporarily drop and reacquire the
	 * ifnet lock while posting kernel events.
	 */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10937 
/*
 * Snapshot the interface's current state (RRC state, LQM, and
 * availability) into the caller's structure under the shared ifnet
 * lock.  Only fields marked valid on the interface are copied, and
 * the output valid_bitmask reflects exactly which fields were filled.
 */
void
if_get_state(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	ifnet_lock_shared(ifp);

	if_interface_state->valid_bitmask = 0;

	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_RRC_STATE_VALID;
		if_interface_state->rrc_state =
		    ifp->if_interface_state.rrc_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
		if_interface_state->lqm_state =
		    ifp->if_interface_state.lqm_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		if_interface_state->interface_availability =
		    ifp->if_interface_state.interface_availability;
	}

	ifnet_lock_done(ifp);
}
10970 
10971 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10972 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10973 {
10974 	if (conn_probe > 1) {
10975 		return EINVAL;
10976 	}
10977 	if (conn_probe == 0) {
10978 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10979 	} else {
10980 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10981 	}
10982 
10983 #if NECP
10984 	necp_update_all_clients();
10985 #endif /* NECP */
10986 
10987 	tcp_probe_connectivity(ifp, conn_probe);
10988 	return 0;
10989 }
10990 
10991 /* for uuid.c */
10992 static int
get_ether_index(int * ret_other_index)10993 get_ether_index(int * ret_other_index)
10994 {
10995 	struct ifnet *ifp;
10996 	int en0_index = 0;
10997 	int other_en_index = 0;
10998 	int any_ether_index = 0;
10999 	short best_unit = 0;
11000 
11001 	*ret_other_index = 0;
11002 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
11003 		/*
11004 		 * find en0, or if not en0, the lowest unit en*, and if not
11005 		 * that, any ethernet
11006 		 */
11007 		ifnet_lock_shared(ifp);
11008 		if (strcmp(ifp->if_name, "en") == 0) {
11009 			if (ifp->if_unit == 0) {
11010 				/* found en0, we're done */
11011 				en0_index = ifp->if_index;
11012 				ifnet_lock_done(ifp);
11013 				break;
11014 			}
11015 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
11016 				other_en_index = ifp->if_index;
11017 				best_unit = ifp->if_unit;
11018 			}
11019 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
11020 			any_ether_index = ifp->if_index;
11021 		}
11022 		ifnet_lock_done(ifp);
11023 	}
11024 	if (en0_index == 0) {
11025 		if (other_en_index != 0) {
11026 			*ret_other_index = other_en_index;
11027 		} else if (any_ether_index != 0) {
11028 			*ret_other_index = any_ether_index;
11029 		}
11030 	}
11031 	return en0_index;
11032 }
11033 
11034 int
uuid_get_ethernet(u_int8_t * node)11035 uuid_get_ethernet(u_int8_t *node)
11036 {
11037 	static int en0_index;
11038 	struct ifnet *ifp;
11039 	int other_index = 0;
11040 	int the_index = 0;
11041 	int ret;
11042 
11043 	ifnet_head_lock_shared();
11044 	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
11045 		en0_index = get_ether_index(&other_index);
11046 	}
11047 	if (en0_index != 0) {
11048 		the_index = en0_index;
11049 	} else if (other_index != 0) {
11050 		the_index = other_index;
11051 	}
11052 	if (the_index != 0) {
11053 		struct dlil_ifnet *dl_if;
11054 
11055 		ifp = ifindex2ifnet[the_index];
11056 		VERIFY(ifp != NULL);
11057 		dl_if = (struct dlil_ifnet *)ifp;
11058 		if (dl_if->dl_if_permanent_ether_is_set != 0) {
11059 			/*
11060 			 * Use the permanent ethernet address if it is
11061 			 * available because it will never change.
11062 			 */
11063 			memcpy(node, dl_if->dl_if_permanent_ether,
11064 			    ETHER_ADDR_LEN);
11065 		} else {
11066 			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
11067 		}
11068 		ret = 0;
11069 	} else {
11070 		ret = -1;
11071 	}
11072 	ifnet_head_done();
11073 	return ret;
11074 }
11075 
11076 static int
11077 sysctl_rxpoll SYSCTL_HANDLER_ARGS
11078 {
11079 #pragma unused(arg1, arg2)
11080 	uint32_t i;
11081 	int err;
11082 
11083 	i = if_rxpoll;
11084 
11085 	err = sysctl_handle_int(oidp, &i, 0, req);
11086 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11087 		return err;
11088 	}
11089 
11090 	if (net_rxpoll == 0) {
11091 		return ENXIO;
11092 	}
11093 
11094 	if_rxpoll = i;
11095 	return err;
11096 }
11097 
11098 static int
11099 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11100 {
11101 #pragma unused(arg1, arg2)
11102 	uint64_t q;
11103 	int err;
11104 
11105 	q = if_rxpoll_mode_holdtime;
11106 
11107 	err = sysctl_handle_quad(oidp, &q, 0, req);
11108 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11109 		return err;
11110 	}
11111 
11112 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11113 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11114 	}
11115 
11116 	if_rxpoll_mode_holdtime = q;
11117 
11118 	return err;
11119 }
11120 
11121 static int
11122 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11123 {
11124 #pragma unused(arg1, arg2)
11125 	uint64_t q;
11126 	int err;
11127 
11128 	q = if_rxpoll_sample_holdtime;
11129 
11130 	err = sysctl_handle_quad(oidp, &q, 0, req);
11131 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11132 		return err;
11133 	}
11134 
11135 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11136 		q = IF_RXPOLL_SAMPLETIME_MIN;
11137 	}
11138 
11139 	if_rxpoll_sample_holdtime = q;
11140 
11141 	return err;
11142 }
11143 
11144 static int
11145 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11146 {
11147 #pragma unused(arg1, arg2)
11148 	uint64_t q;
11149 	int err;
11150 
11151 	q = if_rxpoll_interval_time;
11152 
11153 	err = sysctl_handle_quad(oidp, &q, 0, req);
11154 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11155 		return err;
11156 	}
11157 
11158 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11159 		q = IF_RXPOLL_INTERVALTIME_MIN;
11160 	}
11161 
11162 	if_rxpoll_interval_time = q;
11163 
11164 	return err;
11165 }
11166 
11167 static int
11168 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11169 {
11170 #pragma unused(arg1, arg2)
11171 	uint32_t i;
11172 	int err;
11173 
11174 	i = if_sysctl_rxpoll_wlowat;
11175 
11176 	err = sysctl_handle_int(oidp, &i, 0, req);
11177 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11178 		return err;
11179 	}
11180 
11181 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11182 		return EINVAL;
11183 	}
11184 
11185 	if_sysctl_rxpoll_wlowat = i;
11186 	return err;
11187 }
11188 
11189 static int
11190 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11191 {
11192 #pragma unused(arg1, arg2)
11193 	uint32_t i;
11194 	int err;
11195 
11196 	i = if_sysctl_rxpoll_whiwat;
11197 
11198 	err = sysctl_handle_int(oidp, &i, 0, req);
11199 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11200 		return err;
11201 	}
11202 
11203 	if (i <= if_sysctl_rxpoll_wlowat) {
11204 		return EINVAL;
11205 	}
11206 
11207 	if_sysctl_rxpoll_whiwat = i;
11208 	return err;
11209 }
11210 
11211 static int
11212 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11213 {
11214 #pragma unused(arg1, arg2)
11215 	int i, err;
11216 
11217 	i = if_sndq_maxlen;
11218 
11219 	err = sysctl_handle_int(oidp, &i, 0, req);
11220 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11221 		return err;
11222 	}
11223 
11224 	if (i < IF_SNDQ_MINLEN) {
11225 		i = IF_SNDQ_MINLEN;
11226 	}
11227 
11228 	if_sndq_maxlen = i;
11229 	return err;
11230 }
11231 
11232 static int
11233 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11234 {
11235 #pragma unused(arg1, arg2)
11236 	int i, err;
11237 
11238 	i = if_rcvq_maxlen;
11239 
11240 	err = sysctl_handle_int(oidp, &i, 0, req);
11241 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11242 		return err;
11243 	}
11244 
11245 	if (i < IF_RCVQ_MINLEN) {
11246 		i = IF_RCVQ_MINLEN;
11247 	}
11248 
11249 	if_rcvq_maxlen = i;
11250 	return err;
11251 }
11252 
11253 static int
11254 sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11255 {
11256 #pragma unused(arg1, arg2)
11257 	int i, err;
11258 
11259 	i = if_rcvq_burst_limit;
11260 
11261 	err = sysctl_handle_int(oidp, &i, 0, req);
11262 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11263 		return err;
11264 	}
11265 
11266 /*
11267  * Safeguard the burst limit to "sane" values on customer builds.
11268  */
11269 #if !(DEVELOPMENT || DEBUG)
11270 	if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11271 		i = IF_RCVQ_BURST_LIMIT_MIN;
11272 	}
11273 
11274 	if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11275 		i = IF_RCVQ_BURST_LIMIT_MAX;
11276 	}
11277 #endif
11278 
11279 	if_rcvq_burst_limit = i;
11280 	return err;
11281 }
11282 
11283 static int
11284 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11285 {
11286 #pragma unused(arg1, arg2)
11287 	int i, err;
11288 
11289 	i = if_rcvq_burst_limit;
11290 
11291 	err = sysctl_handle_int(oidp, &i, 0, req);
11292 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11293 		return err;
11294 	}
11295 
11296 	if (IF_RCVQ_TRIM_PCT_MAX < i) {
11297 		i = IF_RCVQ_TRIM_PCT_MAX;
11298 	}
11299 
11300 	if (i < IF_RCVQ_TRIM_PCT_MIN) {
11301 		i = IF_RCVQ_TRIM_PCT_MIN;
11302 	}
11303 
11304 	if_rcvq_trim_pct = i;
11305 	return err;
11306 }
11307 
11308 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11309 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11310     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11311 {
11312 	struct kev_dl_node_presence kev;
11313 	struct sockaddr_dl *sdl;
11314 	struct sockaddr_in6 *sin6;
11315 	int ret = 0;
11316 
11317 	VERIFY(ifp);
11318 	VERIFY(sa);
11319 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11320 
11321 	bzero(&kev, sizeof(kev));
11322 	sin6 = &kev.sin6_node_address;
11323 	sdl = &kev.sdl_node_address;
11324 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11325 	kev.rssi = rssi;
11326 	kev.link_quality_metric = lqm;
11327 	kev.node_proximity_metric = npm;
11328 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11329 
11330 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11331 	if (ret == 0 || ret == EEXIST) {
11332 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11333 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11334 		if (err != 0) {
11335 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11336 			    "error %d\n", __func__, err);
11337 		}
11338 	}
11339 
11340 	if (ret == EEXIST) {
11341 		ret = 0;
11342 	}
11343 	return ret;
11344 }
11345 
11346 void
dlil_node_absent(struct ifnet * ifp,struct sockaddr * sa)11347 dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
11348 {
11349 	struct kev_dl_node_absence kev = {};
11350 	struct sockaddr_in6 *kev_sin6 = NULL;
11351 	struct sockaddr_dl *kev_sdl = NULL;
11352 	int error = 0;
11353 
11354 	VERIFY(ifp != NULL);
11355 	VERIFY(sa != NULL);
11356 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11357 
11358 	kev_sin6 = &kev.sin6_node_address;
11359 	kev_sdl = &kev.sdl_node_address;
11360 
11361 	if (sa->sa_family == AF_INET6) {
11362 		/*
11363 		 * If IPv6 address is given, get the link layer
11364 		 * address from what was cached in the neighbor cache
11365 		 */
11366 		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
11367 		bcopy(sa, kev_sin6, sa->sa_len);
11368 		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
11369 	} else {
11370 		/*
11371 		 * If passed address is AF_LINK type, derive the address
11372 		 * based on the link address.
11373 		 */
11374 		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
11375 		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
11376 	}
11377 
11378 	if (error == 0) {
11379 		kev_sdl->sdl_type = ifp->if_type;
11380 		kev_sdl->sdl_index = ifp->if_index;
11381 
11382 		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
11383 		    &kev.link_data, sizeof(kev), FALSE);
11384 	}
11385 }
11386 
11387 int
dlil_node_present_v2(struct ifnet * ifp,struct sockaddr * sa,struct sockaddr_dl * sdl,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11388 dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
11389     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11390 {
11391 	struct kev_dl_node_presence kev = {};
11392 	struct sockaddr_dl *kev_sdl = NULL;
11393 	struct sockaddr_in6 *kev_sin6 = NULL;
11394 	int ret = 0;
11395 
11396 	VERIFY(ifp != NULL);
11397 	VERIFY(sa != NULL && sdl != NULL);
11398 	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
11399 
11400 	kev_sin6 = &kev.sin6_node_address;
11401 	kev_sdl = &kev.sdl_node_address;
11402 
11403 	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
11404 	bcopy(sdl, kev_sdl, sdl->sdl_len);
11405 	kev_sdl->sdl_type = ifp->if_type;
11406 	kev_sdl->sdl_index = ifp->if_index;
11407 
11408 	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
11409 	bcopy(sa, kev_sin6, sa->sa_len);
11410 
11411 	kev.rssi = rssi;
11412 	kev.link_quality_metric = lqm;
11413 	kev.node_proximity_metric = npm;
11414 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11415 
11416 	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
11417 	if (ret == 0 || ret == EEXIST) {
11418 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11419 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11420 		if (err != 0) {
11421 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
11422 		}
11423 	}
11424 
11425 	if (ret == EEXIST) {
11426 		ret = 0;
11427 	}
11428 	return ret;
11429 }
11430 
11431 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11432 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11433     kauth_cred_t *credp)
11434 {
11435 	const u_int8_t *bytes;
11436 	size_t size;
11437 
11438 	bytes = CONST_LLADDR(sdl);
11439 	size = sdl->sdl_alen;
11440 
11441 #if CONFIG_MACF
11442 	if (dlil_lladdr_ckreq) {
11443 		switch (sdl->sdl_type) {
11444 		case IFT_ETHER:
11445 		case IFT_IEEE1394:
11446 			break;
11447 		default:
11448 			credp = NULL;
11449 			break;
11450 		}
11451 		;
11452 
11453 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11454 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11455 				[0] = 2
11456 			};
11457 
11458 			bytes = unspec;
11459 		}
11460 	}
11461 #else
11462 #pragma unused(credp)
11463 #endif
11464 
11465 	if (sizep != NULL) {
11466 		*sizep = size;
11467 	}
11468 	return bytes;
11469 }
11470 
11471 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11472 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11473     u_int8_t info[DLIL_MODARGLEN])
11474 {
11475 	struct kev_dl_issues kev;
11476 	struct timeval tv;
11477 
11478 	VERIFY(ifp != NULL);
11479 	VERIFY(modid != NULL);
11480 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11481 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11482 
11483 	bzero(&kev, sizeof(kev));
11484 
11485 	microtime(&tv);
11486 	kev.timestamp = tv.tv_sec;
11487 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11488 	if (info != NULL) {
11489 		bcopy(info, &kev.info, DLIL_MODARGLEN);
11490 	}
11491 
11492 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11493 	    &kev.link_data, sizeof(kev), FALSE);
11494 }
11495 
11496 errno_t
ifnet_getset_opportunistic(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11497 ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11498     struct proc *p)
11499 {
11500 	u_int32_t level = IFNET_THROTTLE_OFF;
11501 	errno_t result = 0;
11502 
11503 	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
11504 
11505 	if (cmd == SIOCSIFOPPORTUNISTIC) {
11506 		/*
11507 		 * XXX: Use priv_check_cred() instead of root check?
11508 		 */
11509 		if ((result = proc_suser(p)) != 0) {
11510 			return result;
11511 		}
11512 
11513 		if (ifr->ifr_opportunistic.ifo_flags ==
11514 		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
11515 			level = IFNET_THROTTLE_OPPORTUNISTIC;
11516 		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
11517 			level = IFNET_THROTTLE_OFF;
11518 		} else {
11519 			result = EINVAL;
11520 		}
11521 
11522 		if (result == 0) {
11523 			result = ifnet_set_throttle(ifp, level);
11524 		}
11525 	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
11526 		ifr->ifr_opportunistic.ifo_flags = 0;
11527 		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
11528 			ifr->ifr_opportunistic.ifo_flags |=
11529 			    IFRIFOF_BLOCK_OPPORTUNISTIC;
11530 		}
11531 	}
11532 
11533 	/*
11534 	 * Return the count of current opportunistic connections
11535 	 * over the interface.
11536 	 */
11537 	if (result == 0) {
11538 		uint32_t flags = 0;
11539 		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
11540 		    INPCB_OPPORTUNISTIC_SETCMD : 0;
11541 		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
11542 		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
11543 		ifr->ifr_opportunistic.ifo_inuse =
11544 		    udp_count_opportunistic(ifp->if_index, flags) +
11545 		    tcp_count_opportunistic(ifp->if_index, flags);
11546 	}
11547 
11548 	if (result == EALREADY) {
11549 		result = 0;
11550 	}
11551 
11552 	return result;
11553 }
11554 
11555 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)11556 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11557 {
11558 	struct ifclassq *ifq;
11559 	int err = 0;
11560 
11561 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11562 		return ENXIO;
11563 	}
11564 
11565 	*level = IFNET_THROTTLE_OFF;
11566 
11567 	ifq = ifp->if_snd;
11568 	IFCQ_LOCK(ifq);
11569 	/* Throttling works only for IFCQ, not ALTQ instances */
11570 	if (IFCQ_IS_ENABLED(ifq)) {
11571 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11572 
11573 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11574 		*level = req.level;
11575 	}
11576 	IFCQ_UNLOCK(ifq);
11577 
11578 	return err;
11579 }
11580 
11581 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)11582 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11583 {
11584 	struct ifclassq *ifq;
11585 	int err = 0;
11586 
11587 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11588 		return ENXIO;
11589 	}
11590 
11591 	ifq = ifp->if_snd;
11592 
11593 	switch (level) {
11594 	case IFNET_THROTTLE_OFF:
11595 	case IFNET_THROTTLE_OPPORTUNISTIC:
11596 		break;
11597 	default:
11598 		return EINVAL;
11599 	}
11600 
11601 	IFCQ_LOCK(ifq);
11602 	if (IFCQ_IS_ENABLED(ifq)) {
11603 		cqrq_throttle_t req = { 1, level };
11604 
11605 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11606 	}
11607 	IFCQ_UNLOCK(ifq);
11608 
11609 	if (err == 0) {
11610 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11611 		    level);
11612 #if NECP
11613 		necp_update_all_clients();
11614 #endif /* NECP */
11615 		if (level == IFNET_THROTTLE_OFF) {
11616 			ifnet_start(ifp);
11617 		}
11618 	}
11619 
11620 	return err;
11621 }
11622 
11623 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11624 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11625     struct proc *p)
11626 {
11627 #pragma unused(p)
11628 	errno_t result = 0;
11629 	uint32_t flags;
11630 	int level, category, subcategory;
11631 
11632 	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11633 
11634 	if (cmd == SIOCSIFLOG) {
11635 		if ((result = priv_check_cred(kauth_cred_get(),
11636 		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11637 			return result;
11638 		}
11639 
11640 		level = ifr->ifr_log.ifl_level;
11641 		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11642 			result = EINVAL;
11643 		}
11644 
11645 		flags = ifr->ifr_log.ifl_flags;
11646 		if ((flags &= IFNET_LOGF_MASK) == 0) {
11647 			result = EINVAL;
11648 		}
11649 
11650 		category = ifr->ifr_log.ifl_category;
11651 		subcategory = ifr->ifr_log.ifl_subcategory;
11652 
11653 		if (result == 0) {
11654 			result = ifnet_set_log(ifp, level, flags,
11655 			    category, subcategory);
11656 		}
11657 	} else {
11658 		result = ifnet_get_log(ifp, &level, &flags, &category,
11659 		    &subcategory);
11660 		if (result == 0) {
11661 			ifr->ifr_log.ifl_level = level;
11662 			ifr->ifr_log.ifl_flags = flags;
11663 			ifr->ifr_log.ifl_category = category;
11664 			ifr->ifr_log.ifl_subcategory = subcategory;
11665 		}
11666 	}
11667 
11668 	return result;
11669 }
11670 
/*
 * Set the logging level and facility flags on an interface.
 *
 * The level applies across all facilities; `flags' selects which
 * facilities (DLIL itself and/or the layers below) the request is
 * aimed at.  When the driver has registered an if_output_ctl
 * callback, the non-DLIL facility bits are forwarded to it via
 * IFNET_CTL_SET_LOG; without a callback those bits are silently
 * dropped.  Setting the level to IFNET_LOG_DEFAULT clears all
 * facility flags.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* the DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* NOTE: the assignment inside the condition is intentional */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11729 
/*
 * Read back the per-interface logging configuration.  Any of the
 * output pointers may be NULL if the caller is not interested in that
 * field.  Always succeeds.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11749 
11750 int
ifnet_notify_address(struct ifnet * ifp,int af)11751 ifnet_notify_address(struct ifnet *ifp, int af)
11752 {
11753 	struct ifnet_notify_address_params na;
11754 
11755 #if PF
11756 	(void) pf_ifaddr_hook(ifp);
11757 #endif /* PF */
11758 
11759 	if (ifp->if_output_ctl == NULL) {
11760 		return EOPNOTSUPP;
11761 	}
11762 
11763 	bzero(&na, sizeof(na));
11764 	na.address_family = (sa_family_t)af;
11765 
11766 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11767 	           sizeof(na), &na);
11768 }
11769 
11770 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11771 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11772 {
11773 	if (ifp == NULL || flowid == NULL) {
11774 		return EINVAL;
11775 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11776 	    !IF_FULLY_ATTACHED(ifp)) {
11777 		return ENXIO;
11778 	}
11779 
11780 	*flowid = ifp->if_flowhash;
11781 
11782 	return 0;
11783 }
11784 
11785 errno_t
ifnet_disable_output(struct ifnet * ifp)11786 ifnet_disable_output(struct ifnet *ifp)
11787 {
11788 	int err;
11789 
11790 	if (ifp == NULL) {
11791 		return EINVAL;
11792 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11793 	    !IF_FULLY_ATTACHED(ifp)) {
11794 		return ENXIO;
11795 	}
11796 
11797 	if ((err = ifnet_fc_add(ifp)) == 0) {
11798 		lck_mtx_lock_spin(&ifp->if_start_lock);
11799 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11800 		lck_mtx_unlock(&ifp->if_start_lock);
11801 	}
11802 	return err;
11803 }
11804 
11805 errno_t
ifnet_enable_output(struct ifnet * ifp)11806 ifnet_enable_output(struct ifnet *ifp)
11807 {
11808 	if (ifp == NULL) {
11809 		return EINVAL;
11810 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11811 	    !IF_FULLY_ATTACHED(ifp)) {
11812 		return ENXIO;
11813 	}
11814 
11815 	ifnet_start_common(ifp, TRUE, FALSE);
11816 	return 0;
11817 }
11818 
/*
 * Flow-advisory callback: a driver signals that the flow identified
 * by flowhash may transmit again.  Looks up (and removes) the
 * matching flow-control entry and, if the owning interface is still
 * attached and its flow hash still matches, re-enables output on it.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* lookup removes the entry from the tree; we must free it below */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the io refcnt taken by ifnet_is_attached() above */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11842 
11843 /*
11844  * Function to compare ifnet_fc_entries in ifnet flow control tree
11845  */
11846 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11847 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11848 {
11849 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11850 }
11851 
/*
 * Register ifp in the flow-control tree, keyed by its flow hash, so a
 * later ifnet_flowadv() on that hash can re-enable output.  Returns 0
 * when the entry was added (or already present for this ifp) and
 * EAGAIN on a flow-hash collision with a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; the zone allocation below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11895 
/*
 * Look up and detach the flow-control entry for flowhash.  Returns
 * the entry (already removed from the tree) or NULL when none exists
 * or when the owning interface is no longer attached.  The caller is
 * responsible for releasing a returned entry with
 * ifnet_fc_entry_free().
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11933 
/* Return a flow-control entry to its zone. */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11939 
11940 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11941 ifnet_calc_flowhash(struct ifnet *ifp)
11942 {
11943 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11944 	uint32_t flowhash = 0;
11945 
11946 	if (ifnet_flowhash_seed == 0) {
11947 		ifnet_flowhash_seed = RandomULong();
11948 	}
11949 
11950 	bzero(&fh, sizeof(fh));
11951 
11952 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11953 	fh.ifk_unit = ifp->if_unit;
11954 	fh.ifk_flags = ifp->if_flags;
11955 	fh.ifk_eflags = ifp->if_eflags;
11956 	fh.ifk_capabilities = ifp->if_capabilities;
11957 	fh.ifk_capenable = ifp->if_capenable;
11958 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11959 	fh.ifk_rand1 = RandomULong();
11960 	fh.ifk_rand2 = RandomULong();
11961 
11962 try_again:
11963 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11964 	if (flowhash == 0) {
11965 		/* try to get a non-zero flowhash */
11966 		ifnet_flowhash_seed = RandomULong();
11967 		goto try_again;
11968 	}
11969 
11970 	return flowhash;
11971 }
11972 
/*
 * Install (or, with len == 0, clear) the network signature for the
 * given address family on ifp.  The signature lives in the per-family
 * extension block (IN_IFEXTRA / IN6_IFEXTRA).  Returns EINVAL for an
 * unsupported family or an oversized signature, ENOMEM when the
 * extension block is missing.  `flags' is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* unlock here; break skips the unlock below */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* unlock here; break skips the unlock below */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
12034 
/*
 * Copy the network signature for the given address family out of ifp.
 * On input *len is the caller's buffer size; on success it is updated
 * to the actual signature length and the bytes are copied into data.
 * Returns EINVAL on bad arguments or a too-small buffer, ENOENT when
 * no signature has been set, ENOMEM when the per-family extension
 * block is missing.  *flags (optional) is set to 0 on success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* reject a zero-sized or too-small caller buffer */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				/* unlock here; break skips the unlock below */
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* reject a zero-sized or too-small caller buffer */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				/* unlock here; break skips the unlock below */
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12095 
12096 int
ifnet_set_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12097 ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12098 {
12099 	int i, error = 0, one_set = 0;
12100 
12101 	if_inet6data_lock_exclusive(ifp);
12102 
12103 	if (IN6_IFEXTRA(ifp) == NULL) {
12104 		error = ENOMEM;
12105 		goto out;
12106 	}
12107 
12108 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12109 		uint32_t prefix_len =
12110 		    prefixes[i].prefix_len;
12111 		struct in6_addr *prefix =
12112 		    &prefixes[i].ipv6_prefix;
12113 
12114 		if (prefix_len == 0) {
12115 			clat_log0((LOG_DEBUG,
12116 			    "NAT64 prefixes purged from Interface %s\n",
12117 			    if_name(ifp)));
12118 			/* Allow clearing the signature */
12119 			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
12120 			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12121 			    sizeof(struct in6_addr));
12122 
12123 			continue;
12124 		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
12125 		    prefix_len != NAT64_PREFIX_LEN_40 &&
12126 		    prefix_len != NAT64_PREFIX_LEN_48 &&
12127 		    prefix_len != NAT64_PREFIX_LEN_56 &&
12128 		    prefix_len != NAT64_PREFIX_LEN_64 &&
12129 		    prefix_len != NAT64_PREFIX_LEN_96) {
12130 			clat_log0((LOG_DEBUG,
12131 			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
12132 			error = EINVAL;
12133 			goto out;
12134 		}
12135 
12136 		if (IN6_IS_SCOPE_EMBED(prefix)) {
12137 			clat_log0((LOG_DEBUG,
12138 			    "NAT64 prefix has interface/link local scope.\n"));
12139 			error = EINVAL;
12140 			goto out;
12141 		}
12142 
12143 		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
12144 		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12145 		    sizeof(struct in6_addr));
12146 		clat_log0((LOG_DEBUG,
12147 		    "NAT64 prefix set to %s with prefixlen: %d\n",
12148 		    ip6_sprintf(prefix), prefix_len));
12149 		one_set = 1;
12150 	}
12151 
12152 out:
12153 	if_inet6data_lock_done(ifp);
12154 
12155 	if (error == 0 && one_set != 0) {
12156 		necp_update_all_clients();
12157 	}
12158 
12159 	return error;
12160 }
12161 
12162 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12163 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12164 {
12165 	int i, found_one = 0, error = 0;
12166 
12167 	if (ifp == NULL) {
12168 		return EINVAL;
12169 	}
12170 
12171 	if_inet6data_lock_shared(ifp);
12172 
12173 	if (IN6_IFEXTRA(ifp) == NULL) {
12174 		error = ENOMEM;
12175 		goto out;
12176 	}
12177 
12178 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12179 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12180 			found_one = 1;
12181 		}
12182 	}
12183 
12184 	if (found_one == 0) {
12185 		error = ENOENT;
12186 		goto out;
12187 	}
12188 
12189 	if (prefixes) {
12190 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12191 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12192 	}
12193 
12194 out:
12195 	if_inet6data_lock_done(ifp);
12196 
12197 	return error;
12198 }
12199 
12200 __attribute__((noinline))
12201 static void
dlil_output_cksum_dbg(struct ifnet * ifp,struct mbuf * m,uint32_t hoff,protocol_family_t pf)12202 dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
12203     protocol_family_t pf)
12204 {
12205 #pragma unused(ifp)
12206 	uint32_t did_sw;
12207 
12208 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
12209 	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
12210 		return;
12211 	}
12212 
12213 	switch (pf) {
12214 	case PF_INET:
12215 		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
12216 		if (did_sw & CSUM_DELAY_IP) {
12217 			hwcksum_dbg_finalized_hdr++;
12218 		}
12219 		if (did_sw & CSUM_DELAY_DATA) {
12220 			hwcksum_dbg_finalized_data++;
12221 		}
12222 		break;
12223 	case PF_INET6:
12224 		/*
12225 		 * Checksum offload should not have been enabled when
12226 		 * extension headers exist; that also means that we
12227 		 * cannot force-finalize packets with extension headers.
12228 		 * Indicate to the callee should it skip such case by
12229 		 * setting optlen to -1.
12230 		 */
12231 		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
12232 		    m->m_pkthdr.csum_flags);
12233 		if (did_sw & CSUM_DELAY_IPV6_DATA) {
12234 			hwcksum_dbg_finalized_data++;
12235 		}
12236 		break;
12237 	default:
12238 		return;
12239 	}
12240 }
12241 
/*
 * Debug hook on the input path for exercising partial (16-bit 1's
 * complement) receive checksum offload.  Depending on hwcksum_dbg_mode
 * it can (a) force partial-checksum state onto a packet to emulate
 * hardware without that capability, (b) verify a driver-supplied
 * partial checksum against a software recomputation, and (c) re-adjust
 * the checksum to a different start offset to emulate various
 * hardware designs.  Updates the hwcksum_dbg_* counters as it goes.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/*
	 * The frame header must sit inside this mbuf's buffer, at or
	 * before the current data pointer; otherwise bail out loudly.
	 */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length: bytes between header and payload */
	hlen = (uint32_t)(m->m_data - frame_header);

	/* Only IPv4/IPv6 packets are of interest here */
	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard any receive-checksum state the driver set */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Convert to an offset relative to the payload */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Slide the checksum start from rxoff to aoff */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12366 
12367 static int
12368 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12369 {
12370 #pragma unused(arg1, arg2)
12371 	u_int32_t i;
12372 	int err;
12373 
12374 	i = hwcksum_dbg_mode;
12375 
12376 	err = sysctl_handle_int(oidp, &i, 0, req);
12377 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12378 		return err;
12379 	}
12380 
12381 	if (hwcksum_dbg == 0) {
12382 		return ENODEV;
12383 	}
12384 
12385 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12386 		return EINVAL;
12387 	}
12388 
12389 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12390 
12391 	return err;
12392 }
12393 
12394 static int
12395 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12396 {
12397 #pragma unused(arg1, arg2)
12398 	u_int32_t i;
12399 	int err;
12400 
12401 	i = hwcksum_dbg_partial_rxoff_forced;
12402 
12403 	err = sysctl_handle_int(oidp, &i, 0, req);
12404 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12405 		return err;
12406 	}
12407 
12408 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12409 		return ENODEV;
12410 	}
12411 
12412 	hwcksum_dbg_partial_rxoff_forced = i;
12413 
12414 	return err;
12415 }
12416 
12417 static int
12418 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12419 {
12420 #pragma unused(arg1, arg2)
12421 	u_int32_t i;
12422 	int err;
12423 
12424 	i = hwcksum_dbg_partial_rxoff_adj;
12425 
12426 	err = sysctl_handle_int(oidp, &i, 0, req);
12427 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12428 		return err;
12429 	}
12430 
12431 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12432 		return ENODEV;
12433 	}
12434 
12435 	hwcksum_dbg_partial_rxoff_adj = i;
12436 
12437 	return err;
12438 }
12439 
12440 static int
12441 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12442 {
12443 #pragma unused(oidp, arg1, arg2)
12444 	int err;
12445 
12446 	if (req->oldptr == USER_ADDR_NULL) {
12447 	}
12448 	if (req->newptr != USER_ADDR_NULL) {
12449 		return EPERM;
12450 	}
12451 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12452 	    sizeof(struct chain_len_stats));
12453 
12454 	return err;
12455 }
12456 
#if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification: 325 bytes of fixed test data used as
 * the input pattern for the checksum self-tests below.
 * NOTE(review): the leading 0x1f 0x8b bytes look like a gzip magic
 * number, so this is presumably a small gzip stream; only the length
 * and byte values matter here, not their meaning.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12494 
/*
 * Precomputed 16-bit 1's complement sums for various spans of the above
 * data.  dlil_verify_sum16() checks each span three ways: m_sum16() by
 * offset, m_sum16() by data pointer, and (with INET) b_sum16(), all
 * against the reference in_cksum_mbuf_ref() and the precomputed value.
 */
static struct {
	boolean_t       init;   /* TRUE once sumr was computed at runtime */
	uint16_t        len;    /* span length in bytes from start of blob */
	uint16_t        sumr;   /* reference, filled by in_cksum_mbuf_ref() */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12519 
/*
 * Boot-time self-test for the 16-bit 1's complement checksum routines
 * (m_sum16, and b_sum16 when INET).  For every span length in sumtbl
 * and every alignment within a uint64_t, the test data is copied into
 * an mbuf cluster and summed three different ways; any mismatch against
 * the in_cksum_mbuf_ref() reference panics.  DEBUG/DEVELOPMENT only.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	/*
	 * NOTE(review): m_getcl() result is used unchecked; presumably
	 * M_WAITOK cannot fail here -- confirm against mbuf allocator.
	 */
	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Compute the reference sum once per length */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
12608 #endif /* DEBUG || DEVELOPMENT */
12609 
/* Expands to a case label that returns the stringified constant name */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel event code to its symbolic name, for logging.
 * Returns the empty string for codes not listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12646 
12647 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12648 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12649 {
12650 #pragma unused(arg1)
12651 	struct ifnet *ifp = arg0;
12652 
12653 	if (ifnet_is_attached(ifp, 1)) {
12654 		nstat_ifnet_threshold_reached(ifp->if_index);
12655 		ifnet_decr_iorefcnt(ifp);
12656 	}
12657 }
12658 
/*
 * Check whether the interface's combined rx/tx byte counters have
 * advanced past if_data_threshold since the last notification and, if
 * so, schedule the interface's threshold thread call to inform
 * NetworkStatistics.  Notifications are rate-limited by
 * threshold_interval via a delayed thread call.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 * The CAS on if_dt_bytes ensures only one thread records the
	 * new snapshot and arms the thread call; losing racers simply
	 * return.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Periodic: fire at the next interval boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12688 
12689 #if (DEVELOPMENT || DEBUG)
12690 /*
12691  * The sysctl variable name contains the input parameters of
12692  * ifnet_get_keepalive_offload_frames()
12693  *  ifp (interface index): name[0]
12694  *  frames_array_count:    name[1]
12695  *  frame_data_offset:     name[2]
12696  * The return length gives used_frames_count
12697  */
/*
 * sysctl handler that returns the keep-alive offload frames for one
 * interface (see the comment block above for the name[] layout).
 * Root-only, read-only; copies out one ifnet_keepalive_offload_frame
 * per frame actually in use.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root may look at other people's TCP frames.
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		/* Read-only node */
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/*
	 * Make sure the passed buffer is large enough.
	 * NOTE(review): frames_array_count comes from userland; on LP64
	 * the multiplication is performed in size_t so it cannot wrap,
	 * and oversized requests are rejected against req->oldlen.
	 */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* Translate the interface index to an ifnet under the head lock */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out only the frames actually filled in */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12789 #endif /* DEVELOPMENT || DEBUG */
12790 
/*
 * Thin wrapper: forward per-flow interface statistics to the TCP
 * layer's per-flow stats accounting.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12797 
/*
 * Atomically OR set_flags into *flags_p; returns the flags word as it
 * was before the update (per OSBitOrAtomic semantics).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12803 
/* Atomically clear clear_flags in *flags_p (AND with the complement). */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12809 
/* Atomically set bits in if_eflags; returns the previous flag word. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12815 
/* Atomically clear bits in if_eflags. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12821 
/* Atomically set bits in if_xflags; returns the previous flag word. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12827 
/* Atomically clear bits in if_xflags. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12833 
/* Bump the interface's traffic-rule generation id (relaxed atomic). */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12839 
12840 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12841 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12842 {
12843 	if (*genid != ifp->if_traffic_rule_genid) {
12844 		*genid = ifp->if_traffic_rule_genid;
12845 		return TRUE;
12846 	}
12847 	return FALSE;
12848 }
/*
 * Publish a new traffic-rule count (release store) and bump the
 * generation id so observers re-sync.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12855 
12856 static void
log_hexdump(void * data,size_t len)12857 log_hexdump(void *data, size_t len)
12858 {
12859 	size_t i, j, k;
12860 	unsigned char *ptr = (unsigned char *)data;
12861 #define MAX_DUMP_BUF 32
12862 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12863 
12864 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12865 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12866 			unsigned char msnbl = ptr[j] >> 4;
12867 			unsigned char lsnbl = ptr[j] & 0x0f;
12868 
12869 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12870 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12871 
12872 			if ((j % 2) == 1) {
12873 				buf[k++] = ' ';
12874 			}
12875 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12876 				buf[k++] = ' ';
12877 			}
12878 		}
12879 		buf[k] = 0;
12880 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12881 	}
12882 }
12883 
12884 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12885 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12886 net_check_compatible_if_filter(struct ifnet *ifp)
12887 {
12888 	if (ifp == NULL) {
12889 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12890 			return false;
12891 		}
12892 	} else {
12893 		if (ifp->if_flt_non_os_count > 0) {
12894 			return false;
12895 		}
12896 	}
12897 	return true;
12898 }
12899 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12900 
/*
 * Append-and-advance helper for dlil_dump_top_if_qlen(): after a
 * scnprintf() of `k' bytes, shrink the remaining capacity `clen' and
 * advance the cursor `c'; bails to the caller's "done" label when the
 * buffer is exhausted.  NOTE(review): relies on `k', `clen', `c' and a
 * `done' label existing in the calling function.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
12907 
12908 int dlil_dump_top_if_qlen(char *, int);
12909 int
dlil_dump_top_if_qlen(char * str,int str_len)12910 dlil_dump_top_if_qlen(char *str, int str_len)
12911 {
12912 	char *c = str;
12913 	int k, clen = str_len;
12914 	struct ifnet *top_ifcq_ifp = NULL;
12915 	uint32_t top_ifcq_len = 0;
12916 	struct ifnet *top_inq_ifp = NULL;
12917 	uint32_t top_inq_len = 0;
12918 
12919 	for (int ifidx = 1; ifidx < if_index; ifidx++) {
12920 		struct ifnet *ifp = ifindex2ifnet[ifidx];
12921 		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12922 
12923 		if (ifp == NULL) {
12924 			continue;
12925 		}
12926 		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12927 			top_ifcq_len = ifp->if_snd->ifcq_len;
12928 			top_ifcq_ifp = ifp;
12929 		}
12930 		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12931 			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12932 			top_inq_ifp = ifp;
12933 		}
12934 	}
12935 
12936 	if (top_ifcq_ifp != NULL) {
12937 		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12938 		    top_ifcq_len, top_ifcq_ifp->if_xname);
12939 		DUMP_BUF_CHK();
12940 	}
12941 	if (top_inq_ifp != NULL) {
12942 		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12943 		    top_inq_len, top_inq_ifp->if_xname);
12944 		DUMP_BUF_CHK();
12945 	}
12946 done:
12947 	return str_len - clen;
12948 }
12949