xref: /xnu-10002.1.13/bsd/net/dlil.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63 
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70 
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/if_llatbl.h>
85 #include <net/net_api_stats.h>
86 #include <net/if_ports_used.h>
87 #include <net/if_vlan_var.h>
88 #include <netinet/in.h>
89 #if INET
90 #include <netinet/in_var.h>
91 #include <netinet/igmp_var.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_var.h>
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/if_ether.h>
98 #include <netinet/in_pcb.h>
99 #include <netinet/in_tclass.h>
100 #include <netinet/ip.h>
101 #include <netinet/ip_icmp.h>
102 #include <netinet/icmp_var.h>
103 #endif /* INET */
104 
105 #include <net/nat464_utils.h>
106 #include <netinet6/in6_var.h>
107 #include <netinet6/nd6.h>
108 #include <netinet6/mld6_var.h>
109 #include <netinet6/scope6_var.h>
110 #include <netinet/ip6.h>
111 #include <netinet/icmp6.h>
112 #include <net/pf_pbuf.h>
113 #include <libkern/OSAtomic.h>
114 #include <libkern/tree.h>
115 
116 #include <dev/random/randomdev.h>
117 #include <machine/machine_routines.h>
118 
119 #include <mach/thread_act.h>
120 #include <mach/sdt.h>
121 
122 #if CONFIG_MACF
123 #include <sys/kauth.h>
124 #include <security/mac_framework.h>
125 #include <net/ethernet.h>
126 #include <net/firewire.h>
127 #endif
128 
129 #if PF
130 #include <net/pfvar.h>
131 #endif /* PF */
132 #include <net/pktsched/pktsched.h>
133 #include <net/pktsched/pktsched_netem.h>
134 
135 #if NECP
136 #include <net/necp.h>
137 #endif /* NECP */
138 
139 #if SKYWALK
140 #include <skywalk/packet/packet_queue.h>
141 #include <skywalk/nexus/netif/nx_netif.h>
142 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
143 #endif /* SKYWALK */
144 
145 #include <os/log.h>
146 
147 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
148 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
149 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
150 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
151 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
152 
153 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
154 #define MAX_LINKADDR        4 /* LONGWORDS */
155 
156 #if 1
157 #define DLIL_PRINTF     printf
158 #else
159 #define DLIL_PRINTF     kprintf
160 #endif
161 
162 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
163 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
164 
165 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
166 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
167 
/*
 * Protocol KPI versions; stored in if_proto.proto_kpi to select
 * between the kpi.v1 and kpi.v2 callback sets below.
 */
enum {
	kProtoKPI_v1    = 1,
	kProtoKPI_v2    = 2
};
172 
173 uint64_t if_creation_generation_count = 0;
174 
/*
 * Per-(interface, protocol-family) attachment record.
 *
 * List of if_proto structures in if_proto_hash[] is protected by
 * the ifnet lock.  The rest of the fields are initialized at protocol
 * attach time and never change, thus no lock required as long as
 * a reference to it is valid, via if_proto_ref().
 */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;       /* if_proto_hash[] linkage */
	u_int32_t                   refcount;        /* outstanding references */
	u_int32_t                   detached;        /* detach state flag */
	struct ifnet                *ifp;            /* interface attached to */
	protocol_family_t           protocol_family; /* attached protocol family */
	int                         proto_kpi;       /* kProtoKPI_v1 or kProtoKPI_v2 */
	/* Callback set; which arm is valid is selected by proto_kpi */
	union {
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		/* v2 differs from v1 only in the input callback type */
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
209 
210 SLIST_HEAD(proto_hash_entry, if_proto);
211 
212 #define DLIL_SDLDATALEN \
213 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
214 
/*
 * DLIL-private wrapper around the public ifnet; the ifnet is embedded
 * first, so DLIL_TO_IFP()/IFP_TO_DLIL() (below) are simple casts.
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (DLIF_*, below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct {
		struct ifaddr   ifa;            /* lladdr ifa */
		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;                         /* link-level address storage */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* non-zero once the above is valid */
	u_int8_t dl_if_unused;                  /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
242 
243 /* Values for dl_if_flags (private to DLIL) */
244 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
245 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
246 #define DLIF_DEBUG      0x4     /* has debugging info */
247 
248 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
249 
250 /* For gdb */
251 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
252 
/*
 * Debug variant of dlil_ifnet carrying ifnet reference-trace state
 * (presumably used when ifnet_debug is set -- see dlif_bufsize).
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers,
	 * IF_REF_TRACE_HIST_SIZE entries each.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
263 
264 #define DLIL_TO_IFP(s)  (&s->dl_if)
265 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
266 
/*
 * State for one attached interface filter.  Filters are kept on a
 * per-ifnet TAILQ (linked via filt_next) and are invoked from
 * dlil_interface_filters_input()/_output().
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* per-ifnet list linkage */
	u_int32_t                       filt_skip;      /* skip-this-filter marker -- TODO confirm semantics */
	u_int32_t                       filt_flags;     /* filter flags -- TODO confirm which DLIL_IFF_* apply */
	ifnet_t                         filt_ifp;       /* interface being filtered */
	const char                      *filt_name;     /* filter name */
	void                            *filt_cookie;   /* opaque client context, passed to callbacks */
	protocol_family_t               filt_protocol;  /* protocol family of interest */
	iff_input_func                  filt_input;     /* inbound packet callback */
	iff_output_func                 filt_output;    /* outbound packet callback */
	iff_event_func                  filt_event;     /* interface event callback */
	iff_ioctl_func                  filt_ioctl;     /* ioctl callback */
	iff_detached_func               filt_detached;  /* detach notification callback */
};
281 
282 /* Mbuf queue used for freeing the excessive mbufs */
283 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
284 
285 struct proto_input_entry;
286 
287 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
288 
289 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
290 
291 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
292 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
293 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
294 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
295 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
296 
297 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
298 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
299     &dlil_lck_attributes);
300 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
301     &dlil_lck_attributes);
302 
303 #if DEBUG
304 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
305 #else
306 static unsigned int ifnet_debug;        /* debugging (disabled) */
307 #endif /* !DEBUG */
308 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
309 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
310 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
311 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
312 
313 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
314 
315 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
316 
317 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
318 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
319 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
320 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
321 
322 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
323 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
324 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
325 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
326 
327 static u_int32_t net_rtref;
328 
329 static struct dlil_main_threading_info dlil_main_input_thread_info;
330 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
331     (struct dlil_threading_info *)&dlil_main_input_thread_info;
332 
333 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
334 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
335 static void dlil_if_trace(struct dlil_ifnet *, int);
336 static void if_proto_ref(struct if_proto *);
337 static void if_proto_free(struct if_proto *);
338 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
339 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
340     u_int32_t list_count);
341 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
342 static void if_flt_monitor_busy(struct ifnet *);
343 static void if_flt_monitor_unbusy(struct ifnet *);
344 static void if_flt_monitor_enter(struct ifnet *);
345 static void if_flt_monitor_leave(struct ifnet *);
346 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
347     char **, protocol_family_t);
348 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
349     protocol_family_t);
350 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
351     const struct sockaddr_dl *);
352 static int ifnet_lookup(struct ifnet *);
353 static void if_purgeaddrs(struct ifnet *);
354 
355 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
356     struct mbuf *, char *);
357 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
358     struct mbuf *);
359 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
360     mbuf_t *, const struct sockaddr *, void *, char *, char *);
361 static void ifproto_media_event(struct ifnet *, protocol_family_t,
362     const struct kev_msg *);
363 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
364     unsigned long, void *);
365 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
366     struct sockaddr_dl *, size_t);
367 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
368     const struct sockaddr_dl *, const struct sockaddr *,
369     const struct sockaddr_dl *, const struct sockaddr *);
370 
371 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
372     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
373     boolean_t poll, struct thread *tp);
374 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
375     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
376 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
377 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
378     protocol_family_t *);
379 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
380     const struct ifnet_demux_desc *, u_int32_t);
381 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
382 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
383 #if !XNU_TARGET_OS_OSX
384 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
385     const struct sockaddr *, const char *, const char *,
386     u_int32_t *, u_int32_t *);
387 #else /* XNU_TARGET_OS_OSX */
388 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
389     const struct sockaddr *, const char *, const char *);
390 #endif /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
392     const struct sockaddr *, const char *, const char *,
393     u_int32_t *, u_int32_t *);
394 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
395 static void ifp_if_free(struct ifnet *);
396 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
397 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
398 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
399 
400 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
401     dlil_freeq_t *, struct ifnet_stat_increment_param *);
402 
403 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
404     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405     boolean_t, struct thread *);
406 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
407     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
408     boolean_t, struct thread *);
409 
410 static void dlil_main_input_thread_func(void *, wait_result_t);
411 static void dlil_main_input_thread_cont(void *, wait_result_t);
412 
413 static void dlil_input_thread_func(void *, wait_result_t);
414 static void dlil_input_thread_cont(void *, wait_result_t);
415 
416 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
417 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
418 
419 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
420     thread_continue_t *);
421 static void dlil_terminate_input_thread(struct dlil_threading_info *);
422 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
423     struct dlil_threading_info *, struct ifnet *, boolean_t);
424 static boolean_t dlil_input_stats_sync(struct ifnet *,
425     struct dlil_threading_info *);
426 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
427     u_int32_t, ifnet_model_t, boolean_t);
428 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
429     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
430 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
431 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
432 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
433 #if DEBUG || DEVELOPMENT
434 static void dlil_verify_sum16(void);
435 #endif /* DEBUG || DEVELOPMENT */
436 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
437     protocol_family_t);
438 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
439     protocol_family_t);
440 
441 static void dlil_incr_pending_thread_count(void);
442 static void dlil_decr_pending_thread_count(void);
443 
444 static void ifnet_detacher_thread_func(void *, wait_result_t);
445 static void ifnet_detacher_thread_cont(void *, wait_result_t);
446 static void ifnet_detach_final(struct ifnet *);
447 static void ifnet_detaching_enqueue(struct ifnet *);
448 static struct ifnet *ifnet_detaching_dequeue(void);
449 
450 static void ifnet_start_thread_func(void *, wait_result_t);
451 static void ifnet_start_thread_cont(void *, wait_result_t);
452 
453 static void ifnet_poll_thread_func(void *, wait_result_t);
454 static void ifnet_poll_thread_cont(void *, wait_result_t);
455 
456 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
457     classq_pkt_t *, boolean_t, boolean_t *);
458 
459 static void ifp_src_route_copyout(struct ifnet *, struct route *);
460 static void ifp_src_route_copyin(struct ifnet *, struct route *);
461 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
462 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
463 
464 static errno_t if_mcasts_update_async(struct ifnet *);
465 
466 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
470 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
471 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
472 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
473 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
474 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
475 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
476 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
477 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
478 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
479 
480 struct chain_len_stats tx_chain_len_stats;
481 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
482 
483 #if TEST_INPUT_THREAD_TERMINATION
484 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
485 #endif /* TEST_INPUT_THREAD_TERMINATION */
486 
487 /* The following are protected by dlil_ifnet_lock */
488 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
489 static u_int32_t ifnet_detaching_cnt;
490 static boolean_t ifnet_detaching_embryonic;
491 static void *ifnet_delayed_run; /* wait channel for detaching thread */
492 
493 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
494     &dlil_lck_attributes);
495 
496 static uint32_t ifnet_flowhash_seed;
497 
/*
 * Snapshot of interface attributes used as the hash input for
 * ifnet_calc_flowhash() (salted with ifnet_flowhash_seed).
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name */
	uint32_t        ifk_unit;               /* interface unit */
	uint32_t        ifk_flags;              /* interface flags */
	uint32_t        ifk_eflags;             /* extended flags */
	uint32_t        ifk_capabilities;       /* interface capabilities */
	uint32_t        ifk_capenable;          /* enabled capabilities */
	uint32_t        ifk_output_sched_model; /* output scheduling model */
	uint32_t        ifk_rand1;              /* random component */
	uint32_t        ifk_rand2;              /* random component */
};
509 
/*
 * Flow control entry per interface; nodes of ifnet_fc_tree, looked
 * up by flow hash (see ifnet_fc_get()).
 */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
	u_int32_t       ifce_flowhash;          /* key: hash from ifnet_calc_flowhash() */
	struct ifnet    *ifce_ifp;              /* interface back-pointer */
};
516 
517 static uint32_t ifnet_calc_flowhash(struct ifnet *);
518 static int ifce_cmp(const struct ifnet_fc_entry *,
519     const struct ifnet_fc_entry *);
520 static int ifnet_fc_add(struct ifnet *);
521 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
522 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
523 
524 /* protected by ifnet_fc_lock */
525 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
526 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
527 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
528 
529 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
530 
531 extern void bpfdetach(struct ifnet *);
532 extern void proto_input_run(void);
533 
534 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
535     u_int32_t flags);
536 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
537     u_int32_t flags);
538 
539 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
540 
541 #if CONFIG_MACF
542 #if !XNU_TARGET_OS_OSX
543 int dlil_lladdr_ckreq = 1;
544 #else /* XNU_TARGET_OS_OSX */
545 int dlil_lladdr_ckreq = 0;
546 #endif /* XNU_TARGET_OS_OSX */
547 #endif /* CONFIG_MACF */
548 
549 #if DEBUG
550 int dlil_verbose = 1;
551 #else
552 int dlil_verbose = 0;
553 #endif /* DEBUG */
554 #if IFNET_INPUT_SANITY_CHK
555 /* sanity checking of input packet lists received */
556 static u_int32_t dlil_input_sanity_check = 0;
557 #endif /* IFNET_INPUT_SANITY_CHK */
558 /* rate limit debug messages */
559 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
560 
561 SYSCTL_DECL(_net_link_generic_system);
562 
563 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
564     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
565 
566 #define IF_SNDQ_MINLEN  32
567 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
568 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
569     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
570     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
571 
572 #define IF_RCVQ_MINLEN  32
573 #define IF_RCVQ_MAXLEN  256
574 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
575 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
576     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
577     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
578 
579 /*
580  * Protect against possible memory starvation that may happen
581  * when the driver is pushing data faster than the AP can process.
582  *
583  * If at any point during DLIL input phase any of the input queues
584  * exceeds the burst limit, DLIL will start to trim the queue,
585  * by returning mbufs in the input queue to the cache from which
586  * the mbufs were originally allocated, starting from the oldest
587  * mbuf and continuing until the new limit (see below) is reached.
588  *
589  * In order to avoid a steplocked equilibrium, the trimming
590  * will continue PAST the burst limit, until the corresponding
591  * input queue is reduced to `if_rcvq_trim_pct' %.
592  *
593  * For example, if the input queue limit is 1024 packets,
594  * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
595  * the trimming will continue until the queue contains 819 packets
596  * (1024 * 80 / 100 == 819).
597  *
598  * Setting the burst limit too low can hurt the throughput,
599  * while setting the burst limit too high can defeat the purpose.
600  */
601 #define IF_RCVQ_BURST_LIMIT_MIN         1024
602 #define IF_RCVQ_BURST_LIMIT_DEFAULT     8192
603 #define IF_RCVQ_BURST_LIMIT_MAX         32768
604 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
605 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
606     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
607     sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
608 
609 #define IF_RCVQ_TRIM_PCT_MIN            20
610 #define IF_RCVQ_TRIM_PCT_DEFAULT        80
611 #define IF_RCVQ_TRIM_PCT_MAX            100
612 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
613 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
614     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
615     sysctl_rcvq_trim_pct, "I",
616     "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
617 
618 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
619 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
620 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
621     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
622     "ilog2 of EWMA decay rate of avg inbound packets");
623 
624 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
625 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
626 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
627 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
628     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
629     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
630     "Q", "input poll mode freeze time");
631 
632 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
633 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
634 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
635 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
636     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
637     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
638     "Q", "input poll sampling time");
639 
640 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
641 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
642     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
643     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
644     "Q", "input poll interval (time)");
645 
646 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
647 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
648 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
649     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
650     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
651 
652 #define IF_RXPOLL_WLOWAT        10
653 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
654 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
655     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
656     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
657     "I", "input poll wakeup low watermark");
658 
659 #define IF_RXPOLL_WHIWAT        100
660 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
661 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
662     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
663     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
664     "I", "input poll wakeup high watermark");
665 
666 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
667 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
668     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
669     "max packets per poll call");
670 
671 u_int32_t if_rxpoll = 1;
672 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
673     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
674     sysctl_rxpoll, "I", "enable opportunistic input polling");
675 
676 #if TEST_INPUT_THREAD_TERMINATION
677 static u_int32_t if_input_thread_termination_spin = 0;
678 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
679     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
680     &if_input_thread_termination_spin, 0,
681     sysctl_input_thread_termination_spin,
682     "I", "input thread termination spin limit");
683 #endif /* TEST_INPUT_THREAD_TERMINATION */
684 
685 static u_int32_t cur_dlil_input_threads = 0;
686 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
687     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
688     "Current number of DLIL input threads");
689 
690 #if IFNET_INPUT_SANITY_CHK
691 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
692     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
693     "Turn on sanity checking in DLIL input");
694 #endif /* IFNET_INPUT_SANITY_CHK */
695 
696 static u_int32_t if_flowadv = 1;
697 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
698     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
699     "enable flow-advisory mechanism");
700 
701 static u_int32_t if_delaybased_queue = 1;
702 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
703     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
704     "enable delay based dynamic queue sizing");
705 
706 static uint64_t hwcksum_in_invalidated = 0;
707 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
708     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
709     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
710 
711 uint32_t hwcksum_dbg = 0;
712 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
713     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
714     "enable hardware cksum debugging");
715 
716 u_int32_t ifnet_start_delayed = 0;
717 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
718     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
719     "number of times start was delayed");
720 
721 u_int32_t ifnet_delay_start_disabled = 0;
722 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
723     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
724     "number of times start was delayed");
725 
/*
 * Atomically bump ifnet_delay_start_disabled, the counter exported
 * via the net.link.generic.system.start_delay_disabled sysctl.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
731 
/* hwcksum_dbg_mode bit flags (settable via the sysctl below) */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
743 
/* read-only counters and RW knobs for hardware checksum debugging */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");
796 
/* global enable/disable for TX/RX hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* transmit chain-length histogram, served by sysctl_tx_chain_len_stats */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* interface data-threshold notifications: enable flag and interval */
static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* global networking API usage statistics, exported as net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");
836 
static void log_hexdump(void *data, size_t len);

/* global knobs: RX polling, input-thread CPU affinity, async work */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
849 
850 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)851 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
852 {
853 	/*
854 	 * update filter count and route_generation ID to let TCP
855 	 * know it should reevalute doing TSO or not
856 	 */
857 	if (filter_enable) {
858 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
859 	} else {
860 		VERIFY(ifp->if_flt_no_tso_count != 0);
861 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
862 	}
863 	routegenid_update();
864 }
865 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* cached views of individual if_attach_nx bits, derived from the default */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;

#if (DEVELOPMENT || DEBUG)
886 static int
887 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
888 {
889 #pragma unused(oidp, arg1, arg2)
890 	unsigned int new_value;
891 	int changed;
892 	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
893 	    &new_value, &changed);
894 	if (error) {
895 		return error;
896 	}
897 	if (changed) {
898 		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
899 		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
900 			return ENOTSUP;
901 		}
902 		if_attach_nx = new_value;
903 	}
904 	return 0;
905 }
906 
/* DEVELOPMENT/DEBUG-only knob to read/modify if_attach_nx at runtime */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
912 
913 static int
914 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
915 {
916 #pragma unused(oidp, arg1, arg2)
917 	unsigned int new_value;
918 	int changed;
919 	int error;
920 
921 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
922 	    sizeof(if_enable_fsw_transport_netagent),
923 	    &new_value, &changed);
924 	if (error == 0 && changed != 0) {
925 		if (new_value != 0 && new_value != 1) {
926 			/* only allow 0 or 1 */
927 			error = EINVAL;
928 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
929 			/* netagent can be enabled/disabled */
930 			if_enable_fsw_transport_netagent = new_value;
931 			if (new_value == 0) {
932 				kern_nexus_deregister_netagents();
933 			} else {
934 				kern_nexus_register_netagents();
935 			}
936 		} else {
937 			/* netagent can't be enabled */
938 			error = ENOTSUP;
939 		}
940 	}
941 	return error;
942 }
943 
/* net.link.generic.system.enable_netagent: see handler above */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
952 
953 boolean_t
ifnet_nx_noauto(ifnet_t ifp)954 ifnet_nx_noauto(ifnet_t ifp)
955 {
956 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
957 }
958 
959 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)960 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
961 {
962 	return ifnet_is_low_latency(ifp);
963 }
964 
965 boolean_t
ifnet_is_low_latency(ifnet_t ifp)966 ifnet_is_low_latency(ifnet_t ifp)
967 {
968 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
969 }
970 
971 boolean_t
ifnet_needs_compat(ifnet_t ifp)972 ifnet_needs_compat(ifnet_t ifp)
973 {
974 	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
975 		return FALSE;
976 	}
977 #if !XNU_TARGET_OS_OSX
978 	/*
979 	 * To conserve memory, we plumb in the compat layer selectively; this
980 	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
981 	 * In particular, we check for Wi-Fi Access Point.
982 	 */
983 	if (IFNET_IS_WIFI(ifp)) {
984 		/* Wi-Fi Access Point */
985 		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
986 		    ifp->if_name[2] == '\0') {
987 			return if_netif_all;
988 		}
989 	}
990 #else /* XNU_TARGET_OS_OSX */
991 #pragma unused(ifp)
992 #endif /* XNU_TARGET_OS_OSX */
993 	return TRUE;
994 }
995 
996 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)997 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
998 {
999 	if (if_is_fsw_transport_netagent_enabled()) {
1000 		/* check if netagent has been manually enabled for ipsec/utun */
1001 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
1002 			return ipsec_interface_needs_netagent(ifp);
1003 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
1004 			return utun_interface_needs_netagent(ifp);
1005 		}
1006 
1007 		/* check ifnet no auto nexus override */
1008 		if (ifnet_nx_noauto(ifp)) {
1009 			return FALSE;
1010 		}
1011 
1012 		/* check global if_attach_nx configuration */
1013 		switch (ifp->if_family) {
1014 		case IFNET_FAMILY_CELLULAR:
1015 		case IFNET_FAMILY_ETHERNET:
1016 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
1017 				return TRUE;
1018 			}
1019 			break;
1020 		default:
1021 			break;
1022 		}
1023 	}
1024 	return FALSE;
1025 }
1026 
1027 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1028 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1029 {
1030 #pragma unused(ifp)
1031 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1032 		return TRUE;
1033 	}
1034 	return FALSE;
1035 }
1036 
1037 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1038 ifnet_needs_netif_netagent(ifnet_t ifp)
1039 {
1040 #pragma unused(ifp)
1041 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1042 }
1043 
/*
 * Detach and free one nexus provider instance, detaching the device
 * port first when one is present.  Returns TRUE if an instance was
 * present (teardown errors are logged but not propagated), FALSE if
 * there was nothing to do.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t         err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	/* then free the provider instance itself */
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1070 
1071 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1072 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1073     uuid_t device)
1074 {
1075 	boolean_t               detached = FALSE;
1076 	nexus_controller_t      controller = kern_nexus_shared_controller();
1077 	int                     err;
1078 
1079 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1080 	    device)) {
1081 		detached = TRUE;
1082 	}
1083 	if (provider != NULL && !uuid_is_null(provider)) {
1084 		detached = TRUE;
1085 		err = kern_nexus_controller_deregister_provider(controller,
1086 		    provider);
1087 		if (err != 0) {
1088 			DLIL_PRINTF("%s deregister_provider %d\n",
1089 			    func_str, err);
1090 		}
1091 	}
1092 	return detached;
1093 }
1094 
/*
 * Create a Skywalk nexus provider of the given type ("netif" or
 * "flowswitch") for the interface and allocate one instance of it.
 *
 * On success returns 0 with *provider and *instance holding the UUIDs
 * of the newly registered provider and instance.  On failure a nonzero
 * errno is returned and any partially-registered provider has been
 * deregistered.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	/* e.g. "com.apple.netif.en0" / "com.apple.flowswitch.en0" */
	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration made above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* note: the success path falls through the label with err == 0 */
failed:
	return err;
}
1144 
/*
 * Create and attach a netif nexus (provider + instance) to the
 * interface.  Returns TRUE if the nexus was attached, FALSE if the
 * interface already has one or any setup step failed.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	/*
	 * NOTE(review): attr is only destroyed on the failure path below;
	 * the success path appears to leave it allocated — confirm whether
	 * ownership passes to the nexus layer or this is a leak.
	 */
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1198 
1199 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1200 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1201 {
1202 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1203 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1204 		goto failed;
1205 	}
1206 	switch (ifp->if_type) {
1207 	case IFT_CELLULAR:
1208 	case IFT_ETHER:
1209 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1210 			/* don't auto-attach */
1211 			goto failed;
1212 		}
1213 		break;
1214 	default:
1215 		/* don't auto-attach */
1216 		goto failed;
1217 	}
1218 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1219 
1220 failed:
1221 	return FALSE;
1222 }
1223 
1224 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1225 dlil_is_native_netif_nexus(ifnet_t ifp)
1226 {
1227 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1228 }
1229 
/*
 * Tear down a previously attached netif nexus: host attachment,
 * instance, then provider.  The boolean result of dlil_detach_nexus()
 * is intentionally ignored.
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1237 
1238 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1239 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1240 {
1241 	struct ifreq        ifr;
1242 	int                 error;
1243 
1244 	bzero(&ifr, sizeof(ifr));
1245 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1246 	if (error == 0) {
1247 		*ifdm_p = ifr.ifr_devmtu;
1248 	}
1249 	return error;
1250 }
1251 
/*
 * On macOS, grow *large_buf_size for native Skywalk netifs so it can
 * cover the driver's advertised TSO MTU (or the flowswitch GSO MTU
 * when no TSO MTU is advertised), capped at NX_FSW_MAX_LARGE_BUFSIZE.
 * No-op on other platforms and for non-native netifs.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1287 
1288 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1289 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1290     bool *use_multi_buflet, uint32_t *large_buf_size)
1291 {
1292 	struct kern_pbufpool_memory_info rx_pp_info;
1293 	struct kern_pbufpool_memory_info tx_pp_info;
1294 	uint32_t if_max_mtu = 0;
1295 	uint32_t drv_buf_size;
1296 	struct ifdevmtu ifdm;
1297 	int err;
1298 
1299 	/*
1300 	 * To perform intra-stack RX aggregation flowswitch needs to use
1301 	 * multi-buflet packet.
1302 	 */
1303 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1304 
1305 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1306 	/*
1307 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1308 	 * but the driver advertises the MAX MTU as only 9K.
1309 	 */
1310 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1311 		if_max_mtu = IP_MAXPACKET;
1312 		goto skip_mtu_ioctl;
1313 	}
1314 
1315 	/* determine max mtu */
1316 	bzero(&ifdm, sizeof(ifdm));
1317 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1318 	if (__improbable(err != 0)) {
1319 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1320 		    __func__, if_name(ifp));
1321 		/* use default flowswitch buffer size */
1322 		if_max_mtu = NX_FSW_BUFSIZE;
1323 	} else {
1324 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1325 		    ifdm.ifdm_max, ifdm.ifdm_current);
1326 		/* rdar://problem/44589731 */
1327 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1328 	}
1329 
1330 skip_mtu_ioctl:
1331 	if (if_max_mtu == 0) {
1332 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1333 		    __func__, if_name(ifp));
1334 		return EINVAL;
1335 	}
1336 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1337 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1338 		    "max bufsize(%d)\n", __func__,
1339 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1340 		return EINVAL;
1341 	}
1342 
1343 	/*
1344 	 * for skywalk native driver, consult the driver packet pool also.
1345 	 */
1346 	if (dlil_is_native_netif_nexus(ifp)) {
1347 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1348 		    &tx_pp_info);
1349 		if (err != 0) {
1350 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1351 			    __func__, if_name(ifp));
1352 			return ENXIO;
1353 		}
1354 		drv_buf_size = tx_pp_info.kpm_bufsize *
1355 		    tx_pp_info.kpm_max_frags;
1356 		if (if_max_mtu > drv_buf_size) {
1357 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1358 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1359 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1360 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1361 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1362 			return EINVAL;
1363 		}
1364 	} else {
1365 		drv_buf_size = if_max_mtu;
1366 	}
1367 
1368 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1369 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1370 		*use_multi_buflet = true;
1371 		/* default flowswitch buffer size */
1372 		*buf_size = NX_FSW_BUFSIZE;
1373 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1374 	} else {
1375 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1376 	}
1377 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1378 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1379 	if (*buf_size >= *large_buf_size) {
1380 		*large_buf_size = 0;
1381 	}
1382 	return 0;
1383 }
1384 
/*
 * Create and attach a flowswitch nexus on top of an interface's netif
 * nexus.  Returns TRUE on success; FALSE when auto-attach is disabled
 * for this interface, the netif is not plumbed, or any setup step
 * fails (anything created so far is torn down in that case).
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* size the flowswitch buffers from the driver/netif parameters */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	/*
	 * NOTE(review): attr is only destroyed on the failure path; the
	 * success path appears to leave it allocated — confirm whether
	 * this is intentional (same pattern as the netif attach above).
	 */
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1483 
1484 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1485 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1486 {
1487 	boolean_t               attached;
1488 	if_nexus_flowswitch     nexus_fsw;
1489 
1490 #if (DEVELOPMENT || DEBUG)
1491 	if (skywalk_netif_direct_allowed(if_name(ifp))) {
1492 		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1493 		return FALSE;
1494 	}
1495 #endif /* (DEVELOPMENT || DEBUG) */
1496 
1497 	/*
1498 	 * flowswitch attachment is not supported for interface using the
1499 	 * legacy model (IFNET_INIT_LEGACY)
1500 	 */
1501 	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1502 		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1503 		    if_name(ifp));
1504 		return FALSE;
1505 	}
1506 
1507 	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1508 		/* it's already attached */
1509 		return FALSE;
1510 	}
1511 	bzero(&nexus_fsw, sizeof(nexus_fsw));
1512 	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1513 	if (attached) {
1514 		ifnet_lock_exclusive(ifp);
1515 		if (!IF_FULLY_ATTACHED(ifp)) {
1516 			/* interface is going away */
1517 			attached = FALSE;
1518 		} else {
1519 			ifp->if_nx_flowswitch = nexus_fsw;
1520 		}
1521 		ifnet_lock_done(ifp);
1522 		if (!attached) {
1523 			/* clean up flowswitch nexus */
1524 			dlil_detach_flowswitch_nexus(&nexus_fsw);
1525 		}
1526 	}
1527 	return attached;
1528 }
1529 
/*
 * Tear down a previously attached flowswitch nexus (device port,
 * instance, provider).  The boolean result of dlil_detach_nexus()
 * is intentionally ignored.
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1537 
1538 __attribute__((noinline))
1539 static void
dlil_netif_detach_notify(ifnet_t ifp)1540 dlil_netif_detach_notify(ifnet_t ifp)
1541 {
1542 	ifnet_detach_notify_cb_t notify = NULL;
1543 	void *arg = NULL;
1544 
1545 	ifnet_get_detach_notify(ifp, &notify, &arg);
1546 	if (notify == NULL) {
1547 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1548 		return;
1549 	}
1550 	(*notify)(arg);
1551 }
1552 
/*
 * Quiesce all data movement on the interface, then detach its
 * flowswitch and netif nexuses (in that order).  The asserts enforce
 * that each nexus' UUIDs are either all set or all null.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block new data movement and wait for in-flight ops to drain */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1584 
1585 boolean_t
ifnet_add_netagent(ifnet_t ifp)1586 ifnet_add_netagent(ifnet_t ifp)
1587 {
1588 	int     error;
1589 
1590 	error = kern_nexus_interface_add_netagent(ifp);
1591 	os_log(OS_LOG_DEFAULT,
1592 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1593 	    ifp->if_xname, error);
1594 	return error == 0;
1595 }
1596 
1597 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1598 ifnet_remove_netagent(ifnet_t ifp)
1599 {
1600 	int     error;
1601 
1602 	error = kern_nexus_interface_remove_netagent(ifp);
1603 	os_log(OS_LOG_DEFAULT,
1604 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1605 	    ifp->if_xname, error);
1606 	return error == 0;
1607 }
1608 
1609 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1610 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1611 {
1612 	if (!IF_FULLY_ATTACHED(ifp)) {
1613 		return FALSE;
1614 	}
1615 	return dlil_attach_flowswitch_nexus(ifp);
1616 }
1617 
1618 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1619 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1620 {
1621 	if_nexus_flowswitch     nexus_fsw;
1622 
1623 	ifnet_lock_exclusive(ifp);
1624 	nexus_fsw = ifp->if_nx_flowswitch;
1625 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1626 	ifnet_lock_done(ifp);
1627 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1628 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1629 }
1630 
1631 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1632 ifnet_attach_netif_nexus(ifnet_t ifp)
1633 {
1634 	boolean_t       nexus_attached;
1635 	if_nexus_netif  nexus_netif;
1636 
1637 	if (!IF_FULLY_ATTACHED(ifp)) {
1638 		return FALSE;
1639 	}
1640 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1641 	if (nexus_attached) {
1642 		ifnet_lock_exclusive(ifp);
1643 		ifp->if_nx_netif = nexus_netif;
1644 		ifnet_lock_done(ifp);
1645 	}
1646 	return nexus_attached;
1647 }
1648 
/*
 * Detach the netif nexus from `ifp'.
 *
 * As with the flowswitch variant, the netif state is snapshotted and
 * cleared under the exclusive ifnet lock, and the nexus detach itself
 * happens after the lock is released.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif  nexus_netif;

	/* snapshot and clear under the lock; detach outside of it */
	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1662 
1663 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1664 ifnet_attach_native_flowswitch(ifnet_t ifp)
1665 {
1666 	if (!dlil_is_native_netif_nexus(ifp)) {
1667 		/* not a native netif */
1668 		return;
1669 	}
1670 	ifnet_attach_flowswitch_nexus(ifp);
1671 }
1672 
/*
 * Install (or clear) the flowswitch receive callback for `ifp'.
 *
 * Sleeps until no thread holds a reference on the current callback
 * (taken via ifnet_get_flowswitch_rx_callback() and dropped via
 * ifnet_release_flowswitch_rx_callback()) so the cb/arg pair can be
 * swapped atomically with respect to its users.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* wait for all outstanding users of the callback to drain */
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1688 
/*
 * Fetch the flowswitch receive callback/argument for `ifp' and take a
 * reference on it.  While the reference is held,
 * ifnet_set_flowswitch_rx_callback() blocks; the caller must drop it
 * with ifnet_release_flowswitch_rx_callback().  Returns ENOENT when no
 * callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1710 
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback(); wakes
 * any thread waiting in ifnet_set_flowswitch_rx_callback() once the
 * last reference goes away.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1720 
/*
 * Set (or clear, with parent == NULL) the delegate parent of `difp'.
 *
 * Sleeps until no thread holds a reference on the current parent
 * pointer (taken via ifnet_get_delegate_parent()) so the swap is safe.
 * Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	/* wait for all outstanding users of the parent pointer to drain */
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1735 
/*
 * Fetch the delegate parent of `difp' and take a reference on it; the
 * caller must drop the reference with ifnet_release_delegate_parent().
 * Returns ENOENT when no parent is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1749 
/*
 * Drop a reference taken by ifnet_get_delegate_parent(); wakes any
 * thread waiting in ifnet_set_delegate_parent() once the last
 * reference goes away.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1759 
/*
 * Set the detach-notification callback/argument for `ifp'.
 * Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1768 
/*
 * Fetch the detach-notification callback/argument for `ifp'.
 * Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1777 
/*
 * Locking wrapper around ifnet_set_detach_notify_locked(): takes the
 * exclusive ifnet lock around the update.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1786 
/*
 * Locking wrapper around ifnet_get_detach_notify_locked(): takes the
 * exclusive ifnet lock around the fetch.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1795 #endif /* SKYWALK */
1796 
/*
 * Sanity-check an inbound mbuf: it must carry a pkthdr and its receive
 * interface must match `ifp' (the loopback interface is exempt from
 * the rcvif match).  Panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}
1805 
/*
 * Integer exponentially-weighted moving average:
 *
 *	old = old + (new - old) / 2^decay
 *
 * computed as ((old << decay) - old + new) >> decay.  When the average
 * is still zero it is seeded directly with `new'.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1814 
/* link-speed units used to index the rxpoll watermark table below */
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)

/*
 * Per-link-speed watermarks used by opportunistic (rx) polling to
 * decide when to switch between interrupt and polling mode.
 */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* table is scanned in order; the all-zero row terminates it */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1834 
/* protects dlil_pending_thread_cnt (see incr/decr helpers below) */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* count of DLIL threads still starting up; waiters are woken at zero */
static uint32_t dlil_pending_thread_cnt = 0;
1838 
/*
 * Account for one more DLIL thread that is being brought up.
 * Must be called without dlil_thread_sync_lock held.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1847 
/*
 * Account for one DLIL thread that has finished starting up; wakes any
 * waiter on dlil_pending_thread_cnt once the count reaches zero.
 * Must be called without dlil_thread_sync_lock held.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1860 
1861 int
proto_hash_value(u_int32_t protocol_family)1862 proto_hash_value(u_int32_t protocol_family)
1863 {
1864 	/*
1865 	 * dlil_proto_unplumb_all() depends on the mapping between
1866 	 * the hash bucket index and the protocol family defined
1867 	 * here; future changes must be applied there as well.
1868 	 */
1869 	switch (protocol_family) {
1870 	case PF_INET:
1871 		return 0;
1872 	case PF_INET6:
1873 		return 1;
1874 	case PF_VLAN:
1875 		return 2;
1876 	case PF_UNSPEC:
1877 	default:
1878 		return 3;
1879 	}
1880 }
1881 
1882 /*
1883  * Caller must already be holding ifnet lock.
1884  */
1885 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1886 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1887 {
1888 	struct if_proto *proto = NULL;
1889 	u_int32_t i = proto_hash_value(protocol_family);
1890 
1891 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1892 
1893 	if (ifp->if_proto_hash != NULL) {
1894 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1895 	}
1896 
1897 	while (proto != NULL && proto->protocol_family != protocol_family) {
1898 		proto = SLIST_NEXT(proto, next_hash);
1899 	}
1900 
1901 	if (proto != NULL) {
1902 		if_proto_ref(proto);
1903 	}
1904 
1905 	return proto;
1906 }
1907 
/* Take a reference on an attached protocol entry. */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1913 
1914 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1915 
/*
 * Drop a reference on an attached protocol entry; on the last release,
 * invoke the protocol's detached callback, clean up its routes, post
 * KEV_DL_PROTO_DETACHED, and free the entry.  When the last protocol
 * detaches, the interface is also marked down.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* other references remain; nothing else to do */
		return;
	}

	/* last reference: notify the protocol's detached callback, if any */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1977 
/*
 * Assert the state of the per-ifnet RW lock.  Maps the ifnet-level
 * assertion kind onto the corresponding LCK_RW_ASSERT type; the
 * NOTOWNED case is a no-op because the RW lock primitive does not
 * support that assertion here.
 */
__private_extern__ void
ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
{
#if !MACH_ASSERT
#pragma unused(ifp)
#endif
	unsigned int type = 0;
	int ass = 1;

	switch (what) {
	case IFNET_LCK_ASSERT_EXCLUSIVE:
		type = LCK_RW_ASSERT_EXCLUSIVE;
		break;

	case IFNET_LCK_ASSERT_SHARED:
		type = LCK_RW_ASSERT_SHARED;
		break;

	case IFNET_LCK_ASSERT_OWNED:
		type = LCK_RW_ASSERT_HELD;
		break;

	case IFNET_LCK_ASSERT_NOTOWNED:
		/* nothing to do here for RW lock; bypass assert */
		ass = 0;
		break;

	default:
		panic("bad ifnet assert type: %d", what);
		/* NOTREACHED */
	}
	if (ass) {
		LCK_RW_ASSERT(&ifp->if_lock, type);
	}
}
2013 
/* Take the per-ifnet RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Take the per-ifnet RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-ifnet RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2031 
#if INET
/* Take the per-ifnet IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Take the per-ifnet IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-ifnet IPv4 data RW lock (shared or exclusive). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
2051 
/* Take the per-ifnet IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Take the per-ifnet IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-ifnet IPv6 data RW lock (shared or exclusive). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2069 
/* Take the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Take the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock (shared or exclusive). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert the global interface-list RW lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2093 
2094 /*
2095  * dlil_ifp_protolist
2096  * - get the list of protocols attached to the interface, or just the number
2097  *   of attached protocols
2098  * - if the number returned is greater than 'list_count', truncation occurred
2099  *
2100  * Note:
2101  * - caller must already be holding ifnet lock.
2102  */
2103 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)2104 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2105     u_int32_t list_count)
2106 {
2107 	u_int32_t       count = 0;
2108 	int             i;
2109 
2110 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
2111 
2112 	if (ifp->if_proto_hash == NULL) {
2113 		goto done;
2114 	}
2115 
2116 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2117 		struct if_proto *proto;
2118 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2119 			if (list != NULL && count < list_count) {
2120 				list[count] = proto->protocol_family;
2121 			}
2122 			count++;
2123 		}
2124 	}
2125 done:
2126 	return count;
2127 }
2128 
2129 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)2130 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
2131 {
2132 	ifnet_lock_shared(ifp);
2133 	count = dlil_ifp_protolist(ifp, protolist, count);
2134 	ifnet_lock_done(ifp);
2135 	return count;
2136 }
2137 
/* Free a protocol list previously returned to an if_get_protolist() caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2143 
/*
 * Post a KEV_NETWORK_CLASS kernel event for `ifp'.
 *
 * When `event_data' is NULL a plain net_event_data payload is built on
 * the stack; in either case the interface name/family/unit are filled
 * in before handing the event to dlil_event_internal().  The interface
 * generation count is normally bumped along with the event, except for
 * a few high-frequency event codes or when the caller explicitly
 * passes `suppress_generation'.  Returns the result of
 * dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		/* caller supplied no payload; use the minimal one */
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* identify the interface in the event payload */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2205 
/*
 * Allocate the per-interface TCP/UDP/ECN statistics blocks for `ifp'.
 *
 * The tcpstat/udpstat objects are carved from zones with extra slack
 * so the usable region can be realigned to a 64-bit boundary; the
 * original zone pointer is stashed one pointer-size below the aligned
 * base so it can be recovered at free time.
 *
 * Returns 0 on success, EINVAL otherwise.  NOTE(review): when called
 * with the tcp/udp stats already allocated, `ret' stays EINVAL and the
 * cleanup path below frees the existing blocks — presumably this
 * function is only ever called once per ifp; confirm against callers.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		/* failure: unwind everything, recovering the stashed zone pointers */
		if (ifp->if_tcp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2291 
/*
 * Reset all opportunistic-polling state on `ifp': cancel the poll
 * cycle, return to IFNET_MODEL_INPUT_POLL_OFF, and clear the polling
 * statistics and mode/sample timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2310 
/*
 * Create the input thread for `ifp' (or the main DLIL input thread
 * when ifp is NULL) and initialize `inp' for it.
 *
 * Selects one of four strategies:
 *   - main input thread (ifp == NULL, dlil_init time);
 *   - legacy hybrid rx-polling thread (IFEF_RXPOLL + IFXF_LEGACY);
 *   - asynchronous per-interface thread (net_async or IFXF_LEGACY);
 *   - synchronous (no dedicated thread; returns ENODEV so the caller
 *     knows none was started).
 *
 * On success the chosen continuation (if any) is reported through
 * `thfunc', the receive queue is sized, and the thread is started with
 * reduced precedence and an optional affinity tag.  Failure to start a
 * thread is fatal (panics).
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling requires rxpoll support plus the legacy model */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* the main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* importance 0: run at default precedence */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2452 
#if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for `if_input_thread_termination_spin'.  Rejects
 * updates when opportunistic polling (net_rxpoll) is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or a read-only request: nothing to update */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2476 
/*
 * Tear down a dlil_threading_info previously set up by
 * dlil_create_input_thread(): destroy its lock and lock group and
 * reset every field back to its quiescent state.  The VERIFYs check
 * that the thread, its helpers, and its packet queue are already gone.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2502 
/*
 * Terminate the calling input thread (never the main one): drain its
 * pending packets, signal DLIL_INPUT_TERMINATE_COMPLETE to the waiter,
 * drop the kernel_thread_start() reference, and self-terminate.
 * Must be called from the input thread itself; does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/* detach the pending packet chain and notify the terminator */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2550 
2551 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2552 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2553 {
2554 	thread_affinity_policy_data_t policy;
2555 
2556 	bzero(&policy, sizeof(policy));
2557 	policy.affinity_tag = tag;
2558 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2559 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2560 }
2561 
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Net-filter eventhandler callback: the flowswitch transport netagent
 * is enabled only while no filtering subsystems other than the private
 * PF proxy are active.  When the enablement flips, republish the
 * netagents; when it stays disabled, still refresh NECP clients.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		/* no incompatible filters are active */
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		necp_update_all_clients();
	}
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
2580 
/*
 * dlil_init: one-time initialization of the Data Link Interface Layer.
 *
 * Verifies compile-time invariants (64-bit alignment of atomically
 * updated stats fields; equality of mirrored constant families such as
 * IF_HWASSIST_, IFNET_ and CSUM_), consumes boot-args (and, with
 * SKYWALK, device-tree overrides for the flowswitch netagent), creates
 * the dlil_ifnet / tcpstat / udpstat zones, initializes dependent
 * subsystems, and finally spawns the main DLIL input thread plus the
 * ifnet detacher thread, blocking until both have been scheduled once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	/* ioctl logging flags/categories must mirror the ifnet ones */
	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	/* interface family/subfamily identifiers must mirror the ifnet ones */
	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* Boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	/* No dlil kernel threads should be pending before we create any */
	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	/* report when the boot-arg contradicts the device-tree override */
	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* debug builds use the larger dlil_ifnet_dbg record */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	/* Initialize the ifnet event handler context */
	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	/* drop the extra reference from kernel_thread_start() */
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2895 
2896 static void
if_flt_monitor_busy(struct ifnet * ifp)2897 if_flt_monitor_busy(struct ifnet *ifp)
2898 {
2899 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2900 
2901 	++ifp->if_flt_busy;
2902 	VERIFY(ifp->if_flt_busy != 0);
2903 }
2904 
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	/* Dropping one busy reference is identical to leaving the monitor. */
	if_flt_monitor_leave(ifp);
}
2910 
2911 static void
if_flt_monitor_enter(struct ifnet * ifp)2912 if_flt_monitor_enter(struct ifnet *ifp)
2913 {
2914 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2915 
2916 	while (ifp->if_flt_busy) {
2917 		++ifp->if_flt_waiters;
2918 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2919 		    (PZERO - 1), "if_flt_monitor", NULL);
2920 	}
2921 	if_flt_monitor_busy(ifp);
2922 }
2923 
2924 static void
if_flt_monitor_leave(struct ifnet * ifp)2925 if_flt_monitor_leave(struct ifnet *ifp)
2926 {
2927 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2928 
2929 	VERIFY(ifp->if_flt_busy != 0);
2930 	--ifp->if_flt_busy;
2931 
2932 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2933 		ifp->if_flt_waiters = 0;
2934 		wakeup(&ifp->if_flt_head);
2935 	}
2936 }
2937 
/*
 * Attach an interface filter described by "if_filter" to "ifp" and
 * return the new filter reference via "filter_ref".  The interface
 * must be present in the global ifnet list and still attached; an
 * extra I/O refcount is taken by ifnet_is_attached(ifp, 1) for the
 * duration of the insert and dropped before returning.  Returns 0 on
 * success or ENXIO if the interface cannot be found / is detaching.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: allocation cannot fail, no need to check for NULL */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* enter the filter monitor before touching if_flt_head */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* third-party filter: tracked per-interface as well */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcount taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3028 
/*
 * Detach "filter" from its interface.  With detached == 0 the filter
 * is searched for on every attached interface and, when found, removed
 * from that interface's filter list under the filter monitor; with
 * detached != 0 the caller (ifnet_detach_final) has already emptied
 * if_flt_head, so only the counters are adjusted and the detached
 * callback runs.  Returns 0 on success or EINVAL when the filter
 * reference is not found on any interface.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		/* scan every attached interface for this filter */
		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* re-acquire and wait for the monitor */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* global stats must never underflow */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only reachable on the EINVAL path, where filter was not freed */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3149 
3150 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3151 dlil_detach_filter(interface_filter_t filter)
3152 {
3153 	if (filter == NULL) {
3154 		return;
3155 	}
3156 	dlil_detach_filter_internal(filter, 0);
3157 }
3158 
3159 __private_extern__ boolean_t
dlil_has_ip_filter(void)3160 dlil_has_ip_filter(void)
3161 {
3162 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3163 
3164 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3165 
3166 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3167 	return has_filter;
3168 }
3169 
3170 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3171 dlil_has_if_filter(struct ifnet *ifp)
3172 {
3173 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3174 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3175 	return has_filter;
3176 }
3177 
3178 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)3179 dlil_input_wakeup(struct dlil_threading_info *inp)
3180 {
3181 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3182 
3183 	inp->dlth_flags |= DLIL_INPUT_WAITING;
3184 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3185 		inp->dlth_wtot++;
3186 		wakeup_one((caddr_t)&inp->dlth_flags);
3187 	}
3188 }
3189 
/*
 * Bootstrap entry point for the main DLIL input thread: marks the
 * thread embryonic, issues one self-wakeup so the continuation runs
 * at least once (which drops the pending-thread count awaited by
 * dlil_init()), then blocks into dlil_main_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before waking ourselves so the wakeup isn't lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3212 
3213 /*
3214  * Main input thread:
3215  *
3216  *   a) handles all inbound packets for lo0
3217  *   b) handles all inbound packets for interfaces with no dedicated
3218  *	input thread (e.g. anything but Ethernet/PDP or those that support
3219  *	opportunistic polling.)
3220  *   c) protocol registrations
3221  *   d) packet injections
3222  */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed as both the main and the generic info */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic bit */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the dequeued chains */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock dlil_init() waiting for us to run once */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* no new work arrived while we were busy: go back to sleep */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3309 
3310 /*
3311  * Input thread for interfaces with legacy input model.
3312  */
/*
 * Bootstrap entry point for a per-interface (legacy model) input
 * thread: names the thread after the interface, marks it embryonic,
 * issues one self-wakeup, then blocks into dlil_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy-model threads must not be used for rxpoll interfaces */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before waking ourselves so the wakeup isn't lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3347 
/*
 * Continuation body for a per-interface input thread: repeatedly
 * drains the thread's packet queue, syncs interface stats (unless the
 * interface is backed by a Skywalk netif nexus), processes the mbuf
 * chain, and either blocks again or self-terminates when
 * DLIL_INPUT_TERMINATE is set or the wait was interrupted.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic bit */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock the creator waiting for us to run once */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no new work (other than a pending terminate): stop looping */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3451 
3452 /*
3453  * Input thread for interfaces with opportunistic polling input model.
3454  */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	/*
	 * Bootstrap entry for a per-interface opportunistic-polling input
	 * thread.  Names the thread after the interface, marks the thread
	 * info embryonic, and parks on the continuation routine
	 * (dlil_rxpoll_input_thread_cont), which does all further work.
	 */
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* only dedicated threads of legacy RXPOLL interfaces come here */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait must precede the wakeup so it is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3486 
/*
 * Continuation routine for the opportunistic-polling input thread.
 * Re-entered from the top on every wakeup; drains the input queue,
 * maintains per-interface polling statistics (min/max and EWMA of
 * inbound packets, bytes and wakeup requests), and, when the averages
 * cross the configured watermarks, transitions the driver between
 * polling and interrupt mode via an IFNET_CTL_SET_INPUT_MODEL downcall.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out to teardown if interrupted or marked for termination */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* loop while there is work; dlth_lock is held at the top of each pass */
	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first wakeup after creation: no packets to drain yet */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* enforce the lower bound on the poll interval */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not be worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* not yet the end of the sampling window; keep going */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				/* rate-limit debug output to dlil_dbgrate */
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* hysteresis: hold the current mode for a minimum time */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* below both low watermarks -> switch polling off */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				/* above the high watermarks -> switch polling on */
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be released below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* tell the driver about the new input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				/* kick off the first poll immediately */
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* no pending work besides RUNNING/TERMINATE -> go back to sleep */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* park until the next wakeup; re-enter this continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3772 
3773 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3774 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3775 {
3776 	if (p != NULL) {
3777 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3778 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3779 			return EINVAL;
3780 		}
3781 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3782 		    p->packets_lowat >= p->packets_hiwat) {
3783 			return EINVAL;
3784 		}
3785 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3786 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3787 			return EINVAL;
3788 		}
3789 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3790 		    p->bytes_lowat >= p->bytes_hiwat) {
3791 			return EINVAL;
3792 		}
3793 		if (p->interval_time != 0 &&
3794 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3795 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3796 		}
3797 	}
3798 	return 0;
3799 }
3800 
/*
 * Recompute the interface's polling watermarks and interval.
 *
 * With no link rate and no caller-supplied parameters, polling is
 * effectively disabled (zero low watermarks, maximal high watermarks).
 * Otherwise each value is taken from `p' when provided, else auto-tuned
 * from the rxpoll_tbl entry matching the current input link rate.
 * Caller must hold the input thread's dlth_lock (see
 * dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* find the highest table entry whose speed <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* sysctl overrides (if_rxpoll_max, global interval) win */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond tunables into timespec form for the poller */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3870 
3871 /*
3872  * Must be called on an attached ifnet (caller is expected to check.)
3873  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3874  */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	/*
	 * Validate and apply polling parameters for `ifp'.  `locked'
	 * indicates the caller already holds the input thread's
	 * dlth_lock (e.g. when called from the poller itself); otherwise
	 * the lock is taken and dropped here.  Returns ENXIO when the
	 * interface does not support polling, EINVAL on bad parameters.
	 */
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3912 
3913 /*
3914  * Must be called on an attached ifnet (caller is expected to check.)
3915  */
3916 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3917 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3918 {
3919 	struct dlil_threading_info *inp;
3920 
3921 	VERIFY(ifp != NULL && p != NULL);
3922 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3923 		return ENXIO;
3924 	}
3925 
3926 	bzero(p, sizeof(*p));
3927 
3928 	lck_mtx_lock(&inp->dlth_lock);
3929 	p->packets_limit = ifp->if_rxpoll_plim;
3930 	p->packets_lowat = ifp->if_rxpoll_plowat;
3931 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3932 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3933 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3934 	p->interval_time = ifp->if_rxpoll_ival;
3935 	lck_mtx_unlock(&inp->dlth_lock);
3936 
3937 	return 0;
3938 }
3939 
/*
 * KPI: enqueue a driver-supplied packet chain into DLIL input.
 * Simple variant: no tail pointer and no required stats (counts are
 * derived by walking the chain in ifnet_input_common).
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3946 
/*
 * KPI: extended variant of ifnet_input; the driver supplies the chain
 * tail and a mandatory stat increment (packet/byte counts), avoiding
 * a walk of the chain in the common path.
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3953 
3954 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3955 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3956     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3957 {
3958 	return ifnet_input_common(ifp, m_head, m_tail, s,
3959 	           (m_head != NULL), TRUE);
3960 }
3961 
/*
 * Common back end for ifnet_input{,_extended,_poll}.
 *
 * Validates the chain and parameters, takes a datamov (IO) reference on
 * the interface so it cannot detach mid-delivery, computes or verifies
 * packet/byte counts, and hands the chain to the interface's input
 * strategy (ifp->if_input_dlil).  `ext' means the caller supplied the
 * tail and stats; `poll' means the chain came in via polling mode
 * (where an empty chain is allowed).
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* a NULL chain is only allowed in poll mode; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count as we go */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* re-count the chain to cross-check the driver's stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): when the caller supplied `s', the recomputed
	 * counts in `_s' above are not what is passed here -- only the
	 * s == NULL case aliases `s' to &_s.  Presumably the
	 * driver-provided stats are considered authoritative in that
	 * case; confirm before changing.
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
4076 
4077 #if SKYWALK
/*
 * Atomically install `fn' as the interface's input handler, but only
 * if the current handler is still the default (dlil_input_handler).
 * Returns EBUSY if another handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4085 
/*
 * Restore the default input handler (dlil_input_handler), retrying the
 * compare-and-swap until it succeeds against whatever handler is
 * currently installed.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4095 
/*
 * Atomically install `fn' as the interface's output handler, but only
 * if the current handler is still the default (dlil_output_handler).
 * Returns EBUSY if another handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4103 
/*
 * Restore the default output handler (dlil_output_handler), retrying
 * the compare-and-swap until it succeeds against whatever handler is
 * currently installed.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4113 #endif /* SKYWALK */
4114 
/*
 * Default output handler: forward the packet straight to the
 * interface's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4120 
/*
 * Default input handler: route the chain to the interface's input
 * thread strategy (async or sync), falling back to the main DLIL
 * input thread when the interface has no dedicated one.  On
 * DEVELOPMENT/DEBUG kernels, threads marked NET_THREAD_SYNC_RX force
 * the synchronous path.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4141 
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True only for mbuf-backed queues (QP_MBUF) whose length exceeds the
 * larger of the sysctl burst limit and the queue's own limit.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)

/* upper bound on distinct mbuf classes tracked (see trim accounting) */
#define MAX_KNOWN_MBUF_CLASS 8
4150 
/*
 * Trim an overcommitted input queue down to its target length
 * (qlimit * if_rcvq_trim_pct / 100), dropping the oldest packets from
 * the head of the queue.  The dropped packets are moved onto `freeq'
 * so the caller can free them after releasing the input-thread lock,
 * and `stat_delta' is adjusted so the drops are reflected in the
 * interface statistics.  Returns the number of packets dropped.
 * Caller must hold the lock protecting `input_queue'.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;                   /* The desired queue length after trimming. */
	uint32_t pkts_to_drop;                  /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;              /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 * Saturating subtraction: never let the deltas go negative.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
4247 
/*
 * Asynchronous input strategy: enqueue the chain onto the input
 * thread's receive queue and wake the thread; delivery up the stack
 * happens later on that thread.  Also (once) binds the driver/poller
 * thread into the input thread's affinity set, and trims the queue if
 * the burst limit is exceeded.  Always returns 0; excess packets are
 * dropped (and counted) rather than reported as an error.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* local copy of the stats; adjusted if the queue gets trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* burst control: drop the oldest packets beyond the limit */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4391 
/*
 * Synchronous DLIL input: stage the mbuf chain on the dedicated input
 * thread's receive queue, then immediately drain and process the queue
 * in the caller's context instead of handing off to the input thread.
 *
 * inp    - dedicated per-interface input thread state (never the main one)
 * ifp    - receiving interface
 * m_head - first mbuf of the packet chain
 * m_tail - last mbuf of the packet chain
 * s      - stat increments describing the chain (packets_in / bytes_in)
 * poll   - TRUE if the packets were obtained via the RX poller
 * tp     - unused
 *
 * Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* s_adj starts as a copy of *s; queue trimming below may reduce it */
	struct ifnet_stat_increment_param s_adj = *s;
	/* packets dropped by trimming are freed only after the lock is dropped */
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* enforce the receive-queue burst limit, dropping the overflow */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the caller-supplied counts accurately describe the
	 * input chain `m_head'; this is unaffected by the trimming above.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued so far; it is processed below, unlocked */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4501 
4502 #if SKYWALK
4503 errno_t
ifnet_set_output_handler(struct ifnet * ifp,ifnet_output_func fn)4504 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4505 {
4506 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4507 	           ptrauth_nop_cast(void *, ifp->if_save_output),
4508 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4509 }
4510 
4511 void
ifnet_reset_output_handler(struct ifnet * ifp)4512 ifnet_reset_output_handler(struct ifnet *ifp)
4513 {
4514 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4515 	    ptrauth_nop_cast(void *, ifp->if_output),
4516 	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
4517 		;
4518 	}
4519 }
4520 
4521 errno_t
ifnet_set_start_handler(struct ifnet * ifp,ifnet_start_func fn)4522 ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4523 {
4524 	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4525 	           ptrauth_nop_cast(void *, ifp->if_save_start),
4526 	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4527 }
4528 
4529 void
ifnet_reset_start_handler(struct ifnet * ifp)4530 ifnet_reset_start_handler(struct ifnet *ifp)
4531 {
4532 	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4533 	    ptrauth_nop_cast(void *, ifp->if_start),
4534 	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
4535 		;
4536 	}
4537 }
4538 #endif /* SKYWALK */
4539 
/*
 * Common backend for ifnet_start() and ifnet_start_ignore_delay():
 * record a start request and, when appropriate, wake the dedicated
 * starter thread for `ifp'.
 *
 * resetfc      - TRUE to clear flow-control state before signaling
 * ignore_delay - TRUE to ask the starter to skip the start-delay batching
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	/* only interfaces with a starter thread model are eligible */
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		/* lift any flow-control state before kicking the thread */
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		/* flow-controlled and not resetting: drop the request */
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	/* bump the request counter even if no wakeup is issued below */
	ifp->if_start_req++;
	/*
	 * Wake the starter only if it is idle, and either flow control was
	 * just reset, multi-enqueue batching is off, the queue has grown
	 * past the delay threshold, or no delayed start is pending.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4572 
4573 void
ifnet_start_set_pacemaker_time(struct ifnet * ifp,uint64_t tx_time)4574 ifnet_start_set_pacemaker_time(struct ifnet *ifp, uint64_t tx_time)
4575 {
4576 	ifp->if_start_pacemaker_time = tx_time;
4577 }
4578 
4579 void
ifnet_start(struct ifnet * ifp)4580 ifnet_start(struct ifnet *ifp)
4581 {
4582 	ifnet_start_common(ifp, FALSE, FALSE);
4583 }
4584 
4585 void
ifnet_start_ignore_delay(struct ifnet * ifp)4586 ifnet_start_ignore_delay(struct ifnet *ifp)
4587 {
4588 	ifnet_start_common(ifp, FALSE, TRUE);
4589 }
4590 
/*
 * Entry point for a per-interface starter thread.  Performs one-time
 * setup (thread name, optional affinity binding for lo0), enters the
 * embryonic state, and then parks in ifnet_start_thread_cont(), which
 * services all subsequent transmit-start requests.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			/* this thread must be the first to claim the driver slot */
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	/* arm the wait before publishing the embryonic state */
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	/* park; all further work happens in the continuation */
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4656 
/*
 * Continuation routine for the starter thread: runs on every wakeup,
 * services start requests by invoking the driver's if_start routine in
 * a loop, then re-arms a wait — possibly with a deadline for TBR
 * pacing, delayed-start batching, or the pacemaker timer — and blocks
 * again.  On THREAD_INTERRUPTED or IFSF_TERMINATING it tears the
 * thread down.  Never returns to the caller.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup: leave embryonic state and report thread as ready */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * With IFEF_ENQUEUE_MULTI/IFEF_DELAY_START, hold off calling
		 * the driver while the send queue is still short, so packets
		 * can batch up — unless IFSF_NO_DELAY asks us not to.
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec pacemaker_ts;
		struct timespec *ts = NULL;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		if (ifp->if_start_pacemaker_time != 0) {
			struct timespec now_ts;
			uint64_t now;

			nanouptime(&now_ts);
			now = ((uint64_t)now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

			/* pacemaker deadline still in the future: arm a timed wait */
			if (ifp->if_start_pacemaker_time != 0 &&
			    ifp->if_start_pacemaker_time > now) {
				pacemaker_ts.tv_sec = 0;
				pacemaker_ts.tv_nsec = ifp->if_start_pacemaker_time - now;

				ts = &pacemaker_ts;
				ifp->if_start_flags |= IFSF_NO_DELAY;
				DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, ifp,
				    uint64_t, pacemaker_ts.tv_nsec);
			} else {
				/* deadline already passed: record the miss and disarm */
				DTRACE_SKYWALK2(pacemaker__timer__miss, struct ifnet*, ifp,
				    uint64_t, now - ifp->if_start_pacemaker_time);
				ifp->if_start_pacemaker_time = 0;
				ifp->if_start_flags &= ~IFSF_NO_DELAY;
			}
		}

		/* TBR pacing: wake after the configured restart interval */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		/* delayed-start batching: wake after the configured timeout */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "no deadline" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		ifp->if_start_pacemaker_time = 0;
		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4834 
4835 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4836 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4837 {
4838 	if (ts == NULL) {
4839 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4840 	} else {
4841 		*(&ifp->if_start_cycle) = *ts;
4842 	}
4843 
4844 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4845 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4846 		    if_name(ifp), ts->tv_nsec);
4847 	}
4848 }
4849 
4850 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4851 ifnet_poll_wakeup(struct ifnet *ifp)
4852 {
4853 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4854 
4855 	ifp->if_poll_req++;
4856 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4857 	    ifp->if_poll_thread != THREAD_NULL) {
4858 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4859 	}
4860 }
4861 
4862 void
ifnet_poll(struct ifnet * ifp)4863 ifnet_poll(struct ifnet *ifp)
4864 {
4865 	/*
4866 	 * If the poller thread is inactive, signal it to do work.
4867 	 */
4868 	lck_mtx_lock_spin(&ifp->if_poll_lock);
4869 	ifnet_poll_wakeup(ifp);
4870 	lck_mtx_unlock(&ifp->if_poll_lock);
4871 }
4872 
/*
 * Entry point for a per-interface RX poller thread.  Performs one-time
 * setup (thread name), enters the embryonic state, and then parks in
 * ifnet_poll_thread_cont(), which services all subsequent poll
 * requests.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	/* only interfaces opted into RX polling get a poller thread */
	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	/* arm the wait before publishing the embryonic state */
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* park; all further work happens in the continuation */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4901 
/*
 * Continuation routine for the RX poller thread: on each wakeup it
 * repeatedly calls the driver's if_input_poll routine to pull batches
 * of packets (up to a per-iteration limit) and feeds them into
 * ifnet_input_common(), then re-arms a wait — timed, if a poll cycle
 * is configured — and blocks again.  On THREAD_INTERRUPTED or
 * IF_POLLF_TERMINATING it tears the thread down.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/* first wakeup: leave embryonic state and report thread as ready */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/*
		 * Per-poll packet limit: the explicit if_rxpoll_plim if set,
		 * else the larger of the input queue limit and 4x the
		 * rxpoll high watermark.
		 */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll: still notify the input path (poll mode) */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5068 
5069 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5070 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5071 {
5072 	if (ts == NULL) {
5073 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5074 	} else {
5075 		*(&ifp->if_poll_cycle) = *ts;
5076 	}
5077 
5078 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5079 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5080 		    if_name(ifp), ts->tv_nsec);
5081 	}
5082 }
5083 
5084 void
ifnet_purge(struct ifnet * ifp)5085 ifnet_purge(struct ifnet *ifp)
5086 {
5087 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5088 		if_qflush_snd(ifp, false);
5089 	}
5090 }
5091 
5092 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)5093 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
5094 {
5095 	IFCQ_LOCK_ASSERT_HELD(ifq);
5096 
5097 	if (!(IFCQ_IS_READY(ifq))) {
5098 		return;
5099 	}
5100 
5101 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
5102 		struct tb_profile tb = {
5103 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
5104 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
5105 		};
5106 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
5107 	}
5108 
5109 	ifclassq_update(ifq, ev);
5110 }
5111 
5112 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5113 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5114 {
5115 	switch (ev) {
5116 	case CLASSQ_EV_LINK_BANDWIDTH:
5117 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5118 			ifp->if_poll_update++;
5119 		}
5120 		break;
5121 
5122 	default:
5123 		break;
5124 	}
5125 }
5126 
5127 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)5128 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
5129 {
5130 	struct ifclassq *ifq;
5131 	u_int32_t omodel;
5132 	errno_t err;
5133 
5134 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
5135 		return EINVAL;
5136 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5137 		return ENXIO;
5138 	}
5139 
5140 	ifq = ifp->if_snd;
5141 	IFCQ_LOCK(ifq);
5142 	omodel = ifp->if_output_sched_model;
5143 	ifp->if_output_sched_model = model;
5144 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
5145 		ifp->if_output_sched_model = omodel;
5146 	}
5147 	IFCQ_UNLOCK(ifq);
5148 
5149 	return err;
5150 }
5151 
5152 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5153 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5154 {
5155 	if (ifp == NULL) {
5156 		return EINVAL;
5157 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5158 		return ENXIO;
5159 	}
5160 
5161 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5162 
5163 	return 0;
5164 }
5165 
5166 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5167 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5168 {
5169 	if (ifp == NULL || maxqlen == NULL) {
5170 		return EINVAL;
5171 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5172 		return ENXIO;
5173 	}
5174 
5175 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5176 
5177 	return 0;
5178 }
5179 
5180 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5181 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5182 {
5183 	errno_t err;
5184 
5185 	if (ifp == NULL || pkts == NULL) {
5186 		err = EINVAL;
5187 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5188 		err = ENXIO;
5189 	} else {
5190 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5191 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
5192 	}
5193 
5194 	return err;
5195 }
5196 
5197 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5198 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5199     u_int32_t *pkts, u_int32_t *bytes)
5200 {
5201 	errno_t err;
5202 
5203 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5204 	    (pkts == NULL && bytes == NULL)) {
5205 		err = EINVAL;
5206 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5207 		err = ENXIO;
5208 	} else {
5209 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5210 		    pkts, bytes);
5211 	}
5212 
5213 	return err;
5214 }
5215 
5216 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5217 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5218 {
5219 	struct dlil_threading_info *inp;
5220 
5221 	if (ifp == NULL) {
5222 		return EINVAL;
5223 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5224 		return ENXIO;
5225 	}
5226 
5227 	if (maxqlen == 0) {
5228 		maxqlen = if_rcvq_maxlen;
5229 	} else if (maxqlen < IF_RCVQ_MINLEN) {
5230 		maxqlen = IF_RCVQ_MINLEN;
5231 	}
5232 
5233 	inp = ifp->if_inp;
5234 	lck_mtx_lock(&inp->dlth_lock);
5235 	qlimit(&inp->dlth_pkts) = maxqlen;
5236 	lck_mtx_unlock(&inp->dlth_lock);
5237 
5238 	return 0;
5239 }
5240 
5241 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5242 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5243 {
5244 	struct dlil_threading_info *inp;
5245 
5246 	if (ifp == NULL || maxqlen == NULL) {
5247 		return EINVAL;
5248 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5249 		return ENXIO;
5250 	}
5251 
5252 	inp = ifp->if_inp;
5253 	lck_mtx_lock(&inp->dlth_lock);
5254 	*maxqlen = qlimit(&inp->dlth_pkts);
5255 	lck_mtx_unlock(&inp->dlth_lock);
5256 	return 0;
5257 }
5258 
5259 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5260 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5261     uint16_t delay_timeout)
5262 {
5263 	if (delay_qlen > 0 && delay_timeout > 0) {
5264 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5265 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5266 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
5267 		/* convert timeout to nanoseconds */
5268 		ifp->if_start_delay_timeout *= 1000;
5269 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5270 		    ifp->if_xname, (uint32_t)delay_qlen,
5271 		    (uint32_t)delay_timeout);
5272 	} else {
5273 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5274 	}
5275 }
5276 
5277 /*
5278  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
5279  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
5280  * buf holds the full header.
5281  */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer used when `buf' is not suitably aligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set (ECN bits are preserved) */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incremental one's-complement checksum adjustment for the
		 * cleared TOS bits; since we only cleared bits, a single
		 * carry fold suffices here.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write the modified header back from the bounce buffer */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set in the flow word */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* no checksum update needed: IPv6 has no header checksum */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write the modified header back from the bounce buffer */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
5337 
/*
 * Enqueue a single packet (mbuf or native Skywalk packet) on the given
 * classq (or the interface default send queue when ifcq is NULL).
 * Stamps the packet with an uptime timestamp if it doesn't already
 * carry one, records foreground/realtime activity hints, applies the
 * Wi-Fi multicast DSCP-clearing workaround, runs the delayed-start
 * heuristic for IFEF_ENQUEUE_MULTI drivers, and finally hands the
 * packet to the scheduler (which consumes it).  *pdrop reports whether
 * the classq dropped the packet.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP/IPv6: leave the frame untouched */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may relocate the data; refetch */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* unlike mbufs, no pullup here: skip if too short */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP/IPv6: leave the frame untouched */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* apply the Wi-Fi multicast DSCP workaround, if armed above */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and stop delaying */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate the heuristic */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5648 
5649 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5650 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5651     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5652     boolean_t flush, boolean_t *pdrop)
5653 {
5654 	int error;
5655 
5656 	/* enqueue the packet (caller consumes object) */
5657 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5658 	    cnt, bytes, pdrop);
5659 
5660 	/*
5661 	 * Tell the driver to start dequeueing; do this even when the queue
5662 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5663 	 * be dequeueing from other unsuspended queues.
5664 	 */
5665 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5666 		ifnet_start(ifp);
5667 	}
5668 	return error;
5669 }
5670 
5671 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5672 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5673 {
5674 	struct ifnet *ifp = handle;
5675 	boolean_t pdrop;        /* dummy */
5676 	uint32_t i;
5677 
5678 	ASSERT(n_pkts >= 1);
5679 	for (i = 0; i < n_pkts - 1; i++) {
5680 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5681 		    FALSE, &pdrop);
5682 	}
5683 	/* flush with the last packet */
5684 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5685 	    TRUE, &pdrop);
5686 
5687 	return 0;
5688 }
5689 
5690 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5691 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5692     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5693 {
5694 	if (ifp->if_output_netem != NULL) {
5695 		bool drop;
5696 		errno_t error;
5697 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5698 		*pdrop = drop ? TRUE : FALSE;
5699 		return error;
5700 	} else {
5701 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5702 	}
5703 }
5704 
5705 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5706 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5707 {
5708 	boolean_t pdrop;
5709 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5710 }
5711 
5712 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5713 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5714     boolean_t *pdrop)
5715 {
5716 	classq_pkt_t pkt;
5717 
5718 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5719 	    m->m_nextpkt != NULL) {
5720 		if (m != NULL) {
5721 			m_freem_list(m);
5722 			*pdrop = TRUE;
5723 		}
5724 		return EINVAL;
5725 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5726 	    !IF_FULLY_ATTACHED(ifp)) {
5727 		/* flag tested without lock for performance */
5728 		m_freem(m);
5729 		*pdrop = TRUE;
5730 		return ENXIO;
5731 	} else if (!(ifp->if_flags & IFF_UP)) {
5732 		m_freem(m);
5733 		*pdrop = TRUE;
5734 		return ENETDOWN;
5735 	}
5736 
5737 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5738 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5739 }
5740 
5741 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5742 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5743     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5744     boolean_t *pdrop)
5745 {
5746 	classq_pkt_t head, tail;
5747 
5748 	ASSERT(m_head != NULL);
5749 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5750 	ASSERT(m_tail != NULL);
5751 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5752 	ASSERT(ifp != NULL);
5753 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5754 
5755 	if (!IF_FULLY_ATTACHED(ifp)) {
5756 		/* flag tested without lock for performance */
5757 		m_freem_list(m_head);
5758 		*pdrop = TRUE;
5759 		return ENXIO;
5760 	} else if (!(ifp->if_flags & IFF_UP)) {
5761 		m_freem_list(m_head);
5762 		*pdrop = TRUE;
5763 		return ENETDOWN;
5764 	}
5765 
5766 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5767 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5768 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5769 	           flush, pdrop);
5770 }
5771 
5772 #if SKYWALK
5773 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5774 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5775     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5776 {
5777 	classq_pkt_t pkt;
5778 
5779 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5780 
5781 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5782 		if (kpkt != NULL) {
5783 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5784 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5785 			*pdrop = TRUE;
5786 		}
5787 		return EINVAL;
5788 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5789 	    !IF_FULLY_ATTACHED(ifp))) {
5790 		/* flag tested without lock for performance */
5791 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5792 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5793 		*pdrop = TRUE;
5794 		return ENXIO;
5795 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5796 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5797 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5798 		*pdrop = TRUE;
5799 		return ENETDOWN;
5800 	}
5801 
5802 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5803 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5804 }
5805 
5806 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5807 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5808     boolean_t flush, boolean_t *pdrop)
5809 {
5810 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5811 }
5812 
5813 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5814 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5815     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5816 {
5817 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5818 }
5819 
5820 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5821 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5822     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5823     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5824 {
5825 	classq_pkt_t head, tail;
5826 
5827 	ASSERT(k_head != NULL);
5828 	ASSERT(k_tail != NULL);
5829 	ASSERT(ifp != NULL);
5830 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5831 
5832 	if (!IF_FULLY_ATTACHED(ifp)) {
5833 		/* flag tested without lock for performance */
5834 		pp_free_packet_chain(k_head, NULL);
5835 		*pdrop = TRUE;
5836 		return ENXIO;
5837 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5838 		pp_free_packet_chain(k_head, NULL);
5839 		*pdrop = TRUE;
5840 		return ENETDOWN;
5841 	}
5842 
5843 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5844 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5845 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5846 	           flush, pdrop);
5847 }
5848 
5849 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5850 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5851     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5852     boolean_t *pdrop)
5853 {
5854 	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5855 	           cnt, bytes, flush, pdrop);
5856 }
5857 
5858 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5859 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5860     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5861     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5862 {
5863 	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5864 	           cnt, bytes, flush, pdrop);
5865 }
5866 #endif /* SKYWALK */
5867 
5868 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5869 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5870 {
5871 	errno_t rc;
5872 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5873 
5874 	if (ifp == NULL || mp == NULL) {
5875 		return EINVAL;
5876 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5877 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5878 		return ENXIO;
5879 	}
5880 	if (!ifnet_is_attached(ifp, 1)) {
5881 		return ENXIO;
5882 	}
5883 
5884 #if SKYWALK
5885 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5886 #endif /* SKYWALK */
5887 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5888 	    &pkt, NULL, NULL, NULL, 0);
5889 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5890 	ifnet_decr_iorefcnt(ifp);
5891 	*mp = pkt.cp_mbuf;
5892 	return rc;
5893 }
5894 
5895 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5896 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5897     struct mbuf **mp)
5898 {
5899 	errno_t rc;
5900 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5901 
5902 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5903 		return EINVAL;
5904 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5905 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5906 		return ENXIO;
5907 	}
5908 	if (!ifnet_is_attached(ifp, 1)) {
5909 		return ENXIO;
5910 	}
5911 
5912 #if SKYWALK
5913 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5914 #endif /* SKYWALK */
5915 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5916 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5917 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5918 	ifnet_decr_iorefcnt(ifp);
5919 	*mp = pkt.cp_mbuf;
5920 	return rc;
5921 }
5922 
5923 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5924 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5925     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5926 {
5927 	errno_t rc;
5928 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5929 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5930 
5931 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5932 		return EINVAL;
5933 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5934 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5935 		return ENXIO;
5936 	}
5937 	if (!ifnet_is_attached(ifp, 1)) {
5938 		return ENXIO;
5939 	}
5940 
5941 #if SKYWALK
5942 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5943 #endif /* SKYWALK */
5944 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5945 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5946 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5947 	ifnet_decr_iorefcnt(ifp);
5948 	*head = pkt_head.cp_mbuf;
5949 	if (tail != NULL) {
5950 		*tail = pkt_tail.cp_mbuf;
5951 	}
5952 	return rc;
5953 }
5954 
5955 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5956 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5957     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5958 {
5959 	errno_t rc;
5960 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5961 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5962 
5963 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5964 		return EINVAL;
5965 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5966 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5967 		return ENXIO;
5968 	}
5969 	if (!ifnet_is_attached(ifp, 1)) {
5970 		return ENXIO;
5971 	}
5972 
5973 #if SKYWALK
5974 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5975 #endif /* SKYWALK */
5976 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5977 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5978 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5979 	ifnet_decr_iorefcnt(ifp);
5980 	*head = pkt_head.cp_mbuf;
5981 	if (tail != NULL) {
5982 		*tail = pkt_tail.cp_mbuf;
5983 	}
5984 	return rc;
5985 }
5986 
5987 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5988 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5989     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5990     u_int32_t *len)
5991 {
5992 	errno_t rc;
5993 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5994 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5995 
5996 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5997 	    !MBUF_VALID_SC(sc)) {
5998 		return EINVAL;
5999 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6000 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
6001 		return ENXIO;
6002 	}
6003 	if (!ifnet_is_attached(ifp, 1)) {
6004 		return ENXIO;
6005 	}
6006 
6007 #if SKYWALK
6008 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6009 #endif /* SKYWALK */
6010 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6011 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6012 	    cnt, len, 0);
6013 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6014 	ifnet_decr_iorefcnt(ifp);
6015 	*head = pkt_head.cp_mbuf;
6016 	if (tail != NULL) {
6017 		*tail = pkt_tail.cp_mbuf;
6018 	}
6019 	return rc;
6020 }
6021 
6022 #if XNU_TARGET_OS_OSX
/*
 * Adapter that lets a legacy framer callback (no pre/post byte-count
 * reporting) be used where the extended framer signature is expected:
 * reports zero prepended/appended bytes, then forwards to the legacy
 * framer stored on the interface.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
6037 #endif /* XNU_TARGET_OS_OSX */
6038 
6039 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6040 packet_has_vlan_tag(struct mbuf * m)
6041 {
6042 	u_int   tag = 0;
6043 
6044 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6045 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6046 		if (tag == 0) {
6047 			/* the packet is just priority-tagged, clear the bit */
6048 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6049 		}
6050 	}
6051 	return tag != 0;
6052 }
6053 
/*
 * Run an inbound packet through the interface filter chain.  Returns 0
 * if the packet should continue up the stack, or the non-zero result
 * of the filter that consumed/rejected it.  Filters may replace the
 * mbuf and frame header via m_p / frame_header_p.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/*
	 * NOTE(review): packet_has_vlan_tag() may clear
	 * CSUM_VLAN_TAG_VALID on a priority-only tag even when the
	 * filter list is empty; the output path checks for an empty
	 * list first — confirm this asymmetry is intentional.
	 */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the filter callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6114 
/*
 * Run an outbound packet through the interface filter chain.  Returns
 * 0 if the packet should continue toward the driver, or the non-zero
 * result of the filter that consumed/rejected it.  Filters may replace
 * the mbuf via m_p.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the filter callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6167 
/*
 * Deliver a chain of inbound mbufs to the protocol handler attached to
 * the interface.  v1 handlers take one packet (plus its frame header)
 * at a time; v2 handlers take the whole packet list.  Packets the
 * handler rejects (any error other than EJUSTRETURN) are freed here;
 * EJUSTRETURN means the handler kept ownership despite the error.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char *  frame_header;
			mbuf_t  next_packet;

			/* unlink this packet and detach its frame header */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
6199 
6200 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6201 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6202     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6203 {
6204 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6205 
6206 	if (s->packets_in != 0) {
6207 		d->packets_in += s->packets_in;
6208 	}
6209 	if (s->bytes_in != 0) {
6210 		d->bytes_in += s->bytes_in;
6211 	}
6212 	if (s->errors_in != 0) {
6213 		d->errors_in += s->errors_in;
6214 	}
6215 
6216 	if (s->packets_out != 0) {
6217 		d->packets_out += s->packets_out;
6218 	}
6219 	if (s->bytes_out != 0) {
6220 		d->bytes_out += s->bytes_out;
6221 	}
6222 	if (s->errors_out != 0) {
6223 		d->errors_out += s->errors_out;
6224 	}
6225 
6226 	if (s->collisions != 0) {
6227 		d->collisions += s->collisions;
6228 	}
6229 	if (s->dropped != 0) {
6230 		d->dropped += s->dropped;
6231 	}
6232 
6233 	if (poll) {
6234 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6235 	}
6236 }
6237 
/*
 * Fold the input thread's accumulated stat deltas (inp->dlth_stats)
 * into the interface's global counters and reset each delta to zero.
 * Returns TRUE if the interface has a data threshold configured, so
 * the caller knows to perform threshold processing.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6297 
6298 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6299 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6300 {
6301 	return dlil_input_packet_list_common(ifp, m, 0,
6302 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6303 }
6304 
6305 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6306 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6307     u_int32_t cnt, ifnet_model_t mode)
6308 {
6309 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6310 }
6311 
/*
 * Core inbound demux loop.  Walks a chain of packets (linked through
 * m_nextpkt), and for each packet: demuxes it to a protocol family,
 * optionally performs CLAT46/64 translation, adjusts partial-checksum
 * offsets, runs the interface filters, and batches runs of consecutive
 * packets bound for the same protocol attachment before handing each
 * batch up via dlil_ifproto_input().
 *
 * ifp_param - receiving interface; when NULL, each packet's rcvif is
 *             used instead (packets may then span multiple interfaces).
 * m         - head of the packet chain.
 * cnt       - packet count as reported by the extended entry point.
 * mode      - driver input model; with IFNET_MODEL_INPUT_POLL_ON an
 *             opportunistic ifnet_poll() is kicked periodically below.
 * ext       - TRUE when called via dlil_input_packet_list_extended().
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;   /* proto of the current batch */
	mbuf_t pkt_first = NULL;                /* head of the current batch */
	mbuf_t *pkt_next = NULL;                /* tail link of the batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;                       /* 1 while holding a datamov ref */

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * With polling on and a large enough chain, arrange to kick
	 * ifnet_poll() every poll_ival packets in the loop below.
	 */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain before processing */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		/* the wake-packet mark survives classifier re-init as well */
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			/* unknown family; packet is discarded further below */
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* debug-only hexdump of packets that woke the system */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* dlil_clat64 may replace m; refresh the data pointer */
			error = dlil_clat64(ifp, &protocol_family, &m);
			/*
			 * NOTE(review): mbuf_data(m) is read before the error
			 * check below — this presumes m remains valid (non-NULL)
			 * even when dlil_clat64 fails; confirm against the
			 * dlil_clat64 contract.
			 */
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		/* hardware-checksum debugging, skipped for loopback traffic */
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* an implausible frame_header invalidates the checksum */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6644 
6645 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6646 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6647 {
6648 	errno_t err;
6649 
6650 	if (sync) {
6651 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6652 		if (err == EAFNOSUPPORT) {
6653 			err = 0;
6654 		}
6655 	} else {
6656 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6657 		err = 0;
6658 	}
6659 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6660 	    "(err=%d)\n", if_name(ifp),
6661 	    (err == 0 ? "successfully restored" : "failed to restore"),
6662 	    ifp->if_updatemcasts, err);
6663 
6664 	/* just return success */
6665 	return 0;
6666 }
6667 
6668 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6669 if_mcasts_update_async(struct ifnet *ifp)
6670 {
6671 	return if_mcasts_update_common(ifp, false);
6672 }
6673 
6674 errno_t
if_mcasts_update(struct ifnet * ifp)6675 if_mcasts_update(struct ifnet *ifp)
6676 {
6677 	return if_mcasts_update_common(ifp, true);
6678 }
6679 
6680 /* If ifp is set, we will increment the generation for the interface */
6681 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6682 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6683 {
6684 	if (ifp != NULL) {
6685 		ifnet_increment_generation(ifp);
6686 	}
6687 
6688 #if NECP
6689 	necp_update_all_clients();
6690 #endif /* NECP */
6691 
6692 	return kev_post_msg(event);
6693 }
6694 
6695 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6696 dlil_post_sifflags_msg(struct ifnet * ifp)
6697 {
6698 	struct kev_msg ev_msg;
6699 	struct net_event_data ev_data;
6700 
6701 	bzero(&ev_data, sizeof(ev_data));
6702 	bzero(&ev_msg, sizeof(ev_msg));
6703 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6704 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6705 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6706 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6707 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6708 	ev_data.if_family = ifp->if_family;
6709 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6710 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6711 	ev_msg.dv[0].data_ptr = &ev_data;
6712 	ev_msg.dv[1].data_length = 0;
6713 	dlil_post_complete_msg(ifp, &ev_msg);
6714 }
6715 
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver a kernel event for an interface to all interested parties,
 * in order: attached interface filters, each attached protocol's event
 * callback, the interface's own if_event handler, and finally the
 * kernel event subsystem (via dlil_post_complete_msg, which optionally
 * bumps the interface generation).
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	/* small on-stack array avoids allocation in the common case */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the mutex across the callout */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	/* first pass: count attached protocols to size the snapshot */
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many protocols for the stack array; allocate */
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot every attached protocol, taking a ref on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted protocol, unlocked */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6816 
6817 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6818 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6819 {
6820 	struct kev_msg kev_msg;
6821 	int result = 0;
6822 
6823 	if (ifp == NULL || event == NULL) {
6824 		return EINVAL;
6825 	}
6826 
6827 	bzero(&kev_msg, sizeof(kev_msg));
6828 	kev_msg.vendor_code = event->vendor_code;
6829 	kev_msg.kev_class = event->kev_class;
6830 	kev_msg.kev_subclass = event->kev_subclass;
6831 	kev_msg.event_code = event->event_code;
6832 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6833 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6834 	kev_msg.dv[1].data_length = 0;
6835 
6836 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6837 
6838 	return result;
6839 }
6840 
6841 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6842 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6843 {
6844 	mbuf_t  n = m;
6845 	int chainlen = 0;
6846 
6847 	while (n != NULL) {
6848 		chainlen++;
6849 		n = n->m_next;
6850 	}
6851 	switch (chainlen) {
6852 	case 0:
6853 		break;
6854 	case 1:
6855 		os_atomic_inc(&cls->cls_one, relaxed);
6856 		break;
6857 	case 2:
6858 		os_atomic_inc(&cls->cls_two, relaxed);
6859 		break;
6860 	case 3:
6861 		os_atomic_inc(&cls->cls_three, relaxed);
6862 		break;
6863 	case 4:
6864 		os_atomic_inc(&cls->cls_four, relaxed);
6865 		break;
6866 	case 5:
6867 	default:
6868 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6869 		break;
6870 	}
6871 }
6872 
#if CONFIG_DTRACE
/*
 * Fire the DTrace IP send probe for an outbound IPv4 or IPv6 packet;
 * other protocol families are ignored.  Kept out-of-line so the probe
 * setup stays off the fast path.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	switch (proto_family) {
	case PF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
		break;
	}
	case PF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
		break;
	}
	default:
		break;
	}
}
#endif /* CONFIG_DTRACE */
6891 
6892 /*
6893  * dlil_output
6894  *
6895  * Caller should have a lock on the protocol domain if the protocol
6896  * doesn't support finer grained locking. In most cases, the lock
6897  * will be held from the socket layer and won't be released until
6898  * we return back to the socket layer.
6899  *
6900  * This does mean that we must take a protocol lock before we take
6901  * an interface lock if we're going to take both. This makes sense
6902  * because a protocol is likely to interact with an ifp while it
6903  * is under the protocol lock.
6904  *
6905  * An advisory code will be returned if adv is not null. This
6906  * can be used to provide feedback about interface queues to the
6907  * application.
6908  */
6909 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6910 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6911     void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6912 {
6913 	char *frame_type = NULL;
6914 	char *dst_linkaddr = NULL;
6915 	int retval = 0;
6916 	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6917 	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6918 	struct if_proto *proto = NULL;
6919 	mbuf_t  m = NULL;
6920 	mbuf_t  send_head = NULL;
6921 	mbuf_t  *send_tail = &send_head;
6922 	int iorefcnt = 0;
6923 	u_int32_t pre = 0, post = 0;
6924 	u_int32_t fpkts = 0, fbytes = 0;
6925 	int32_t flen = 0;
6926 	struct timespec now;
6927 	u_int64_t now_nsec;
6928 	boolean_t did_clat46 = FALSE;
6929 	protocol_family_t old_proto_family = proto_family;
6930 	struct sockaddr_in6 dest6;
6931 	struct rtentry *rt = NULL;
6932 	u_int32_t m_loop_set = 0;
6933 
6934 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6935 
6936 	/*
6937 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6938 	 * from happening while this operation is in progress
6939 	 */
6940 	if (!ifnet_datamov_begin(ifp)) {
6941 		retval = ENXIO;
6942 		goto cleanup;
6943 	}
6944 	iorefcnt = 1;
6945 
6946 	VERIFY(ifp->if_output_dlil != NULL);
6947 
6948 	/* update the driver's multicast filter, if needed */
6949 	if (ifp->if_updatemcasts > 0) {
6950 		if_mcasts_update_async(ifp);
6951 		ifp->if_updatemcasts = 0;
6952 	}
6953 
6954 	frame_type = frame_type_buffer;
6955 	dst_linkaddr = dst_linkaddr_buffer;
6956 
6957 	if (raw == 0) {
6958 		ifnet_lock_shared(ifp);
6959 		/* callee holds a proto refcnt upon success */
6960 		proto = find_attached_proto(ifp, proto_family);
6961 		if (proto == NULL) {
6962 			ifnet_lock_done(ifp);
6963 			retval = ENXIO;
6964 			goto cleanup;
6965 		}
6966 		ifnet_lock_done(ifp);
6967 	}
6968 
6969 preout_again:
6970 	if (packetlist == NULL) {
6971 		goto cleanup;
6972 	}
6973 
6974 	m = packetlist;
6975 	packetlist = packetlist->m_nextpkt;
6976 	m->m_nextpkt = NULL;
6977 
6978 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6979 
6980 	/*
6981 	 * Perform address family translation for the first
6982 	 * packet outside the loop in order to perform address
6983 	 * lookup for the translated proto family.
6984 	 */
6985 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6986 	    (ifp->if_type == IFT_CELLULAR ||
6987 	    dlil_is_clat_needed(proto_family, m))) {
6988 		retval = dlil_clat46(ifp, &proto_family, &m);
6989 		/*
6990 		 * Go to the next packet if translation fails
6991 		 */
6992 		if (retval != 0) {
6993 			m_freem(m);
6994 			m = NULL;
6995 			ip6stat.ip6s_clat464_out_drop++;
6996 			/* Make sure that the proto family is PF_INET */
6997 			ASSERT(proto_family == PF_INET);
6998 			goto preout_again;
6999 		}
7000 		/*
7001 		 * Free the old one and make it point to the IPv6 proto structure.
7002 		 *
7003 		 * Change proto for the first time we have successfully
7004 		 * performed address family translation.
7005 		 */
7006 		if (!did_clat46 && proto_family == PF_INET6) {
7007 			did_clat46 = TRUE;
7008 
7009 			if (proto != NULL) {
7010 				if_proto_free(proto);
7011 			}
7012 			ifnet_lock_shared(ifp);
7013 			/* callee holds a proto refcnt upon success */
7014 			proto = find_attached_proto(ifp, proto_family);
7015 			if (proto == NULL) {
7016 				ifnet_lock_done(ifp);
7017 				retval = ENXIO;
7018 				m_freem(m);
7019 				m = NULL;
7020 				goto cleanup;
7021 			}
7022 			ifnet_lock_done(ifp);
7023 			if (ifp->if_type == IFT_ETHER) {
7024 				/* Update the dest to translated v6 address */
7025 				dest6.sin6_len = sizeof(struct sockaddr_in6);
7026 				dest6.sin6_family = AF_INET6;
7027 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
7028 				dest = (const struct sockaddr *)&dest6;
7029 
7030 				/*
7031 				 * Lookup route to the translated destination
7032 				 * Free this route ref during cleanup
7033 				 */
7034 				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
7035 				    0, 0, ifp->if_index);
7036 
7037 				route = rt;
7038 			}
7039 		}
7040 	}
7041 
7042 	/*
7043 	 * This path gets packet chain going to the same destination.
7044 	 * The pre output routine is used to either trigger resolution of
7045 	 * the next hop or retreive the next hop's link layer addressing.
7046 	 * For ex: ether_inet(6)_pre_output routine.
7047 	 *
7048 	 * If the routine returns EJUSTRETURN, it implies that packet has
7049 	 * been queued, and therefore we have to call preout_again for the
7050 	 * following packet in the chain.
7051 	 *
7052 	 * For errors other than EJUSTRETURN, the current packet is freed
7053 	 * and the rest of the chain (pointed by packetlist is freed as
7054 	 * part of clean up.
7055 	 *
7056 	 * Else if there is no error the retrieved information is used for
7057 	 * all the packets in the chain.
7058 	 */
7059 	if (raw == 0) {
7060 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
7061 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
7062 		retval = 0;
7063 		if (preoutp != NULL) {
7064 			retval = preoutp(ifp, proto_family, &m, dest, route,
7065 			    frame_type, dst_linkaddr);
7066 
7067 			if (retval != 0) {
7068 				if (retval == EJUSTRETURN) {
7069 					goto preout_again;
7070 				}
7071 				m_freem(m);
7072 				m = NULL;
7073 				goto cleanup;
7074 			}
7075 		}
7076 	}
7077 
7078 	do {
7079 		/*
7080 		 * pkt_hdr is set here to point to m_data prior to
7081 		 * calling into the framer. This value of pkt_hdr is
7082 		 * used by the netif gso logic to retrieve the ip header
7083 		 * for the TCP packets, offloaded for TSO processing.
7084 		 */
7085 		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
7086 			uint8_t vlan_encap_len = 0;
7087 
7088 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
7089 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
7090 			}
7091 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
7092 		} else {
7093 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
7094 		}
7095 
7096 		/*
7097 		 * Perform address family translation if needed.
7098 		 * For now we only support stateless 4 to 6 translation
7099 		 * on the out path.
7100 		 *
7101 		 * The routine below translates IP header, updates protocol
7102 		 * checksum and also translates ICMP.
7103 		 *
7104 		 * We skip the first packet as it is already translated and
7105 		 * the proto family is set to PF_INET6.
7106 		 */
7107 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
7108 		    (ifp->if_type == IFT_CELLULAR ||
7109 		    dlil_is_clat_needed(proto_family, m))) {
7110 			retval = dlil_clat46(ifp, &proto_family, &m);
7111 			/* Goto the next packet if the translation fails */
7112 			if (retval != 0) {
7113 				m_freem(m);
7114 				m = NULL;
7115 				ip6stat.ip6s_clat464_out_drop++;
7116 				goto next;
7117 			}
7118 		}
7119 
7120 #if CONFIG_DTRACE
7121 		if (!raw) {
7122 			dlil_output_dtrace(ifp, proto_family, m);
7123 		}
7124 #endif /* CONFIG_DTRACE */
7125 
7126 		if (raw == 0 && ifp->if_framer != NULL) {
7127 			int rcvif_set = 0;
7128 
7129 			/*
7130 			 * If this is a broadcast packet that needs to be
7131 			 * looped back into the system, set the inbound ifp
7132 			 * to that of the outbound ifp.  This will allow
7133 			 * us to determine that it is a legitimate packet
7134 			 * for the system.  Only set the ifp if it's not
7135 			 * already set, just to be safe.
7136 			 */
7137 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
7138 			    m->m_pkthdr.rcvif == NULL) {
7139 				m->m_pkthdr.rcvif = ifp;
7140 				rcvif_set = 1;
7141 			}
7142 			m_loop_set = m->m_flags & M_LOOP;
7143 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
7144 			    frame_type, &pre, &post);
7145 			if (retval != 0) {
7146 				if (retval != EJUSTRETURN) {
7147 					m_freem(m);
7148 				}
7149 				goto next;
7150 			}
7151 
7152 			/*
7153 			 * For partial checksum offload, adjust the start
7154 			 * and stuff offsets based on the prepended header.
7155 			 */
7156 			if ((m->m_pkthdr.csum_flags &
7157 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7158 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7159 				m->m_pkthdr.csum_tx_stuff += pre;
7160 				m->m_pkthdr.csum_tx_start += pre;
7161 			}
7162 
7163 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
7164 				dlil_output_cksum_dbg(ifp, m, pre,
7165 				    proto_family);
7166 			}
7167 
7168 			/*
7169 			 * Clear the ifp if it was set above, and to be
7170 			 * safe, only if it is still the same as the
7171 			 * outbound ifp we have in context.  If it was
7172 			 * looped back, then a copy of it was sent to the
7173 			 * loopback interface with the rcvif set, and we
7174 			 * are clearing the one that will go down to the
7175 			 * layer below.
7176 			 */
7177 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
7178 				m->m_pkthdr.rcvif = NULL;
7179 			}
7180 		}
7181 
7182 		/*
7183 		 * Let interface filters (if any) do their thing ...
7184 		 */
7185 		retval = dlil_interface_filters_output(ifp, &m, proto_family);
7186 		if (retval != 0) {
7187 			if (retval != EJUSTRETURN) {
7188 				m_freem(m);
7189 			}
7190 			goto next;
7191 		}
7192 		/*
7193 		 * Strip away M_PROTO1 bit prior to sending packet
7194 		 * to the driver as this field may be used by the driver
7195 		 */
7196 		m->m_flags &= ~M_PROTO1;
7197 
7198 		/*
7199 		 * If the underlying interface is not capable of handling a
7200 		 * packet whose data portion spans across physically disjoint
7201 		 * pages, we need to "normalize" the packet so that we pass
7202 		 * down a chain of mbufs where each mbuf points to a span that
7203 		 * resides in the system page boundary.  If the packet does
7204 		 * not cross page(s), the following is a no-op.
7205 		 */
7206 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
7207 			if ((m = m_normalize(m)) == NULL) {
7208 				goto next;
7209 			}
7210 		}
7211 
7212 		/*
7213 		 * If this is a TSO packet, make sure the interface still
7214 		 * advertise TSO capability.
7215 		 */
7216 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
7217 			retval = EMSGSIZE;
7218 			m_freem(m);
7219 			goto cleanup;
7220 		}
7221 
7222 		ifp_inc_traffic_class_out(ifp, m);
7223 
7224 #if SKYWALK
7225 		/*
7226 		 * For native skywalk devices, packets will be passed to pktap
7227 		 * after GSO or after the mbuf to packet conversion.
7228 		 * This is done for IPv4/IPv6 packets only because there is no
7229 		 * space in the mbuf to pass down the proto family.
7230 		 */
7231 		if (dlil_is_native_netif_nexus(ifp)) {
7232 			if (raw || m->m_pkthdr.pkt_proto == 0) {
7233 				pktap_output(ifp, proto_family, m, pre, post);
7234 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
7235 			}
7236 		} else {
7237 			pktap_output(ifp, proto_family, m, pre, post);
7238 		}
7239 #else /* SKYWALK */
7240 		pktap_output(ifp, proto_family, m, pre, post);
7241 #endif /* SKYWALK */
7242 
7243 		/*
7244 		 * Count the number of elements in the mbuf chain
7245 		 */
7246 		if (tx_chain_len_count) {
7247 			dlil_count_chain_len(m, &tx_chain_len_stats);
7248 		}
7249 
7250 		/*
7251 		 * Record timestamp; ifnet_enqueue() will use this info
7252 		 * rather than redoing the work.  An optimization could
7253 		 * involve doing this just once at the top, if there are
7254 		 * no interface filters attached, but that's probably
7255 		 * not a big deal.
7256 		 */
7257 		nanouptime(&now);
7258 		net_timernsec(&now, &now_nsec);
7259 		(void) mbuf_set_timestamp(m, now_nsec, TRUE);
7260 
7261 		/*
7262 		 * Discard partial sum information if this packet originated
7263 		 * from another interface; the packet would already have the
7264 		 * final checksum and we shouldn't recompute it.
7265 		 */
7266 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
7267 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7268 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7269 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
7270 			m->m_pkthdr.csum_data = 0;
7271 		}
7272 
7273 		/*
7274 		 * Finally, call the driver.
7275 		 */
7276 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
7277 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7278 				flen += (m_pktlen(m) - (pre + post));
7279 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7280 			}
7281 			*send_tail = m;
7282 			send_tail = &m->m_nextpkt;
7283 		} else {
7284 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7285 				flen = (m_pktlen(m) - (pre + post));
7286 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7287 			} else {
7288 				flen = 0;
7289 			}
7290 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7291 			    0, 0, 0, 0, 0);
7292 			retval = (*ifp->if_output_dlil)(ifp, m);
7293 			if (retval == EQFULL || retval == EQSUSPENDED) {
7294 				if (adv != NULL && adv->code == FADV_SUCCESS) {
7295 					adv->code = (retval == EQFULL ?
7296 					    FADV_FLOW_CONTROLLED :
7297 					    FADV_SUSPENDED);
7298 				}
7299 				retval = 0;
7300 			}
7301 			if (retval == 0 && flen > 0) {
7302 				fbytes += flen;
7303 				fpkts++;
7304 			}
7305 			if (retval != 0 && dlil_verbose) {
7306 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7307 				    __func__, if_name(ifp),
7308 				    retval);
7309 			}
7310 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7311 			    0, 0, 0, 0, 0);
7312 		}
7313 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7314 
7315 next:
7316 		m = packetlist;
7317 		if (m != NULL) {
7318 			m->m_flags |= m_loop_set;
7319 			packetlist = packetlist->m_nextpkt;
7320 			m->m_nextpkt = NULL;
7321 		}
7322 		/* Reset the proto family to old proto family for CLAT */
7323 		if (did_clat46) {
7324 			proto_family = old_proto_family;
7325 		}
7326 	} while (m != NULL);
7327 
7328 	if (send_head != NULL) {
7329 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7330 		    0, 0, 0, 0, 0);
7331 		if (ifp->if_eflags & IFEF_SENDLIST) {
7332 			retval = (*ifp->if_output_dlil)(ifp, send_head);
7333 			if (retval == EQFULL || retval == EQSUSPENDED) {
7334 				if (adv != NULL) {
7335 					adv->code = (retval == EQFULL ?
7336 					    FADV_FLOW_CONTROLLED :
7337 					    FADV_SUSPENDED);
7338 				}
7339 				retval = 0;
7340 			}
7341 			if (retval == 0 && flen > 0) {
7342 				fbytes += flen;
7343 				fpkts++;
7344 			}
7345 			if (retval != 0 && dlil_verbose) {
7346 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7347 				    __func__, if_name(ifp), retval);
7348 			}
7349 		} else {
7350 			struct mbuf *send_m;
7351 			int enq_cnt = 0;
7352 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7353 			while (send_head != NULL) {
7354 				send_m = send_head;
7355 				send_head = send_m->m_nextpkt;
7356 				send_m->m_nextpkt = NULL;
7357 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7358 				if (retval == EQFULL || retval == EQSUSPENDED) {
7359 					if (adv != NULL) {
7360 						adv->code = (retval == EQFULL ?
7361 						    FADV_FLOW_CONTROLLED :
7362 						    FADV_SUSPENDED);
7363 					}
7364 					retval = 0;
7365 				}
7366 				if (retval == 0) {
7367 					enq_cnt++;
7368 					if (flen > 0) {
7369 						fpkts++;
7370 					}
7371 				}
7372 				if (retval != 0 && dlil_verbose) {
7373 					DLIL_PRINTF("%s: output error on %s "
7374 					    "retval = %d\n",
7375 					    __func__, if_name(ifp), retval);
7376 				}
7377 			}
7378 			if (enq_cnt > 0) {
7379 				fbytes += flen;
7380 				ifnet_start(ifp);
7381 			}
7382 		}
7383 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7384 	}
7385 
7386 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7387 
7388 cleanup:
7389 	if (fbytes > 0) {
7390 		ifp->if_fbytes += fbytes;
7391 	}
7392 	if (fpkts > 0) {
7393 		ifp->if_fpackets += fpkts;
7394 	}
7395 	if (proto != NULL) {
7396 		if_proto_free(proto);
7397 	}
7398 	if (packetlist) { /* if any packets are left, clean up */
7399 		mbuf_freem_list(packetlist);
7400 	}
7401 	if (retval == EJUSTRETURN) {
7402 		retval = 0;
7403 	}
7404 	if (iorefcnt == 1) {
7405 		ifnet_datamov_end(ifp);
7406 	}
7407 	if (rt != NULL) {
7408 		rtfree(rt);
7409 		rt = NULL;
7410 	}
7411 
7412 	return retval;
7413 }
7414 
7415 /*
7416  * This routine checks if the destination address is not a loopback, link-local,
7417  * multicast or broadcast address.
7418  */
7419 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7420 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7421 {
7422 	int ret = 0;
7423 	switch (proto_family) {
7424 	case PF_INET: {
7425 		struct ip *iph = mtod(m, struct ip *);
7426 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7427 			ret = 1;
7428 		}
7429 		break;
7430 	}
7431 	case PF_INET6: {
7432 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7433 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7434 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7435 			ret = 1;
7436 		}
7437 		break;
7438 	}
7439 	}
7440 
7441 	return ret;
7442 }
7443 /*
7444  * @brief This routine translates IPv4 packet to IPv6 packet,
7445  *     updates protocol checksum and also translates ICMP for code
7446  *     along with inner header translation.
7447  *
7448  * @param ifp Pointer to the interface
7449  * @param proto_family pointer to protocol family. It is updated if function
7450  *     performs the translation successfully.
7451  * @param m Pointer to the pointer pointing to the packet. Needed because this
7452  *     routine can end up changing the mbuf to a different one.
7453  *
7454  * @return 0 on success or else a negative value.
7455  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;
	struct in6_addr dst;
	int error = 0;
	uint16_t off = 0;
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/*
	 * Wrap the mbuf in a pbuf; from here on the packet is accessed
	 * and rewritten through the pbuf, and the (possibly replaced)
	 * mbuf is handed back to the caller at cleanup.
	 */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Capture the original IPv4 fields needed for the translation. */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);      /* IPv4 header length in bytes */
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	/* Drop the reference taken by in6ifa_ifpwithflag() above. */
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/*
	 * Hand the (possibly replaced) mbuf back to the caller; if the
	 * pbuf is no longer valid the packet was lost during translation
	 * and *m is cleared so the caller does not touch freed memory.
	 */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	/* Only on full success does the packet change family to PF_INET6. */
	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7589 
7590 /*
7591  * @brief This routine translates incoming IPv6 to IPv4 packet,
7592  *     updates protocol checksum and also translates ICMPv6 outer
7593  *     and inner headers
7594  *
7595  * @return 0 on success or else a negative value.
7596  */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Capture the original IPv6 addresses needed for the translation. */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		/*
		 * Wrap the mbuf in a pbuf; the translated (possibly
		 * replaced) mbuf is handed back at cleanup below.
		 */
		pbuf_t pbuf_store, *pbuf = NULL;
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		/* Done with the IPv6 address reference taken above. */
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class bits of the IPv6 flow field become IPv4 TOS. */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		/* Drop the reference taken by inifa_ifpclatv4() above. */
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/*
		 * Hand the (possibly replaced) mbuf back to the caller;
		 * an invalid pbuf means the packet was lost in translation.
		 */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		/* Only on full success does the packet change family to PF_INET. */
		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7731 
7732 /* The following is used to enqueue work items for ifnet ioctl events */
7733 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7734 
/* Argument captured for a deferred ifnet ioctl (see ifnet_ioctl_async). */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface, holds an io refcnt while queued */
	u_long ioctl_code;      /* SIOCADDMULTI or SIOCDELMULTI */
};

/* Work-queue entry embedding the ioctl event; freed by the callback. */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;                    /* must be first-class member for __container_of */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7744 
/*
 * Queue an ioctl (SIOCADDMULTI/SIOCDELMULTI only) for asynchronous
 * execution on the network work queue.  Takes an io reference on the
 * interface which is released by ifnet_ioctl_event_callback() once the
 * deferred ioctl has run.  Duplicate requests of the same kind are
 * coalesced via the if_mcast_{add,del}_signaled flags.
 */
void
ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
	bool compare_expected;

	/*
	 * Get an io ref count if the interface is attached.
	 * At this point it most likely is. We are taking a reference for
	 * deferred processing.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
		    "is not attached",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}
	switch (ioctl_code) {
	case SIOCADDMULTI:
		/*
		 * Atomically flip the "add signaled" flag false -> true;
		 * if it was already true an identical request is pending,
		 * so drop the reference and coalesce with it.
		 */
		compare_expected = false;
		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
			ifnet_decr_iorefcnt(ifp);
			return;
		}
		break;
	case SIOCDELMULTI:
		/* Same coalescing scheme for pending delete requests. */
		compare_expected = false;
		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
			ifnet_decr_iorefcnt(ifp);
			return;
		}
		break;
	default:
		/* Unsupported code: log and bail (reference already dropped
		 * implicitly?  NOTE(review): this path returns without
		 * ifnet_decr_iorefcnt; confirm whether that is intentional. */
		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}

	/* Z_NOFAIL: allocation cannot fail, no NULL check needed. */
	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
}
7791 
7792 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7793 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7794 {
7795 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7796 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7797 
7798 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7799 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7800 	int ret = 0;
7801 
7802 	switch (ioctl_code) {
7803 	case SIOCADDMULTI:
7804 		atomic_store(&ifp->if_mcast_add_signaled, false);
7805 		break;
7806 	case SIOCDELMULTI:
7807 		atomic_store(&ifp->if_mcast_del_signaled, false);
7808 		break;
7809 	}
7810 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7811 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7812 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7813 	} else if (dlil_verbose) {
7814 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7815 		    "for ioctl %lu",
7816 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7817 	}
7818 	ifnet_decr_iorefcnt(ifp);
7819 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7820 	return;
7821 }
7822 
/*
 * Dispatch an ioctl to, in order: the attached interface filters, the
 * protocol (if proto_fam != 0), and finally the interface itself.
 *
 * retval starts at EOPNOTSUPP, meaning "not yet handled"; the first
 * layer that returns anything else (after mapping ENOTSUP to
 * EOPNOTSUPP) decides the result, and EJUSTRETURN from any layer
 * overrides and short-circuits (mapped to 0 on return).
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock while calling out to the filter;
			 * if_flt_monitor_busy() above keeps the list stable
			 * so the iteration remains safe.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			/* Pick the ioctl handler for the proto's KPI version. */
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means handled with nothing further to report. */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	/* Release the io reference taken at entry. */
	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7940 
7941 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7942 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7943 {
7944 	errno_t error = 0;
7945 
7946 	if (ifp->if_set_bpf_tap) {
7947 		/* Get an io reference on the interface if it is attached */
7948 		if (!ifnet_is_attached(ifp, 1)) {
7949 			return ENXIO;
7950 		}
7951 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7952 		ifnet_decr_iorefcnt(ifp);
7953 	}
7954 	return error;
7955 }
7956 
7957 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7958 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7959     struct sockaddr *ll_addr, size_t ll_len)
7960 {
7961 	errno_t result = EOPNOTSUPP;
7962 	struct if_proto *proto;
7963 	const struct sockaddr *verify;
7964 	proto_media_resolve_multi resolvep;
7965 
7966 	if (!ifnet_is_attached(ifp, 1)) {
7967 		return result;
7968 	}
7969 
7970 	bzero(ll_addr, ll_len);
7971 
7972 	/* Call the protocol first; callee holds a proto refcnt upon success */
7973 	ifnet_lock_shared(ifp);
7974 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7975 	ifnet_lock_done(ifp);
7976 	if (proto != NULL) {
7977 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7978 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7979 		if (resolvep != NULL) {
7980 			result = resolvep(ifp, proto_addr,
7981 			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7982 		}
7983 		if_proto_free(proto);
7984 	}
7985 
7986 	/* Let the interface verify the multicast address */
7987 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7988 		if (result == 0) {
7989 			verify = ll_addr;
7990 		} else {
7991 			verify = proto_addr;
7992 		}
7993 		result = ifp->if_check_multi(ifp, verify);
7994 	}
7995 
7996 	ifnet_decr_iorefcnt(ifp);
7997 	return result;
7998 }
7999 
8000 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8001 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
8002     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8003     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8004 {
8005 	struct if_proto *proto;
8006 	errno_t result = 0;
8007 
8008 	if ((ifp->if_flags & IFF_NOARP) != 0) {
8009 		result = ENOTSUP;
8010 		goto done;
8011 	}
8012 
8013 	/* callee holds a proto refcnt upon success */
8014 	ifnet_lock_shared(ifp);
8015 	proto = find_attached_proto(ifp, target_proto->sa_family);
8016 	ifnet_lock_done(ifp);
8017 	if (proto == NULL) {
8018 		result = ENOTSUP;
8019 	} else {
8020 		proto_media_send_arp    arpp;
8021 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8022 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8023 		if (arpp == NULL) {
8024 			result = ENOTSUP;
8025 		} else {
8026 			switch (arpop) {
8027 			case ARPOP_REQUEST:
8028 				arpstat.txrequests++;
8029 				if (target_hw != NULL) {
8030 					arpstat.txurequests++;
8031 				}
8032 				break;
8033 			case ARPOP_REPLY:
8034 				arpstat.txreplies++;
8035 				break;
8036 			}
8037 			result = arpp(ifp, arpop, sender_hw, sender_proto,
8038 			    target_hw, target_proto);
8039 		}
8040 		if_proto_free(proto);
8041 	}
8042 done:
8043 	return result;
8044 }
8045 
/*
 * Opaque token type for the net_thread_marks push/pop API below.  The
 * struct is intentionally empty: tokens are never dereferenced.  A
 * token encodes a 32-bit mask as a pointer offset from the address of
 * net_thread_marks_base (see net_thread_marks_push / _pop).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token for "no bits changed" — offset 0 from the base address. */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8051 
/*
 * Set the bits of `push` in the current thread's uu_network_marks and
 * return a token recording which bits were newly set.  The token is the
 * base address plus that bit mask, so the matching net_thread_marks_pop()
 * can recover and clear exactly those bits — bits that were already set
 * before the push are left alone by the pop.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits not already set become "ours" to undo. */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* Encode the newly-set bits as a pointer offset from base. */
	return (net_thread_marks_t)&base[pop];
}
8069 
/*
 * Inverse of net_thread_marks_push(): clear the bits of `unpush` in the
 * current thread's uu_network_marks and return a token recording which
 * bits were actually cleared, so net_thread_unmarks_pop() can restore
 * exactly those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits currently set can be cleared (and restored). */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* Encode the cleared bits as a pointer offset from base. */
	return (net_thread_marks_t)&base[unpop];
}
8087 
/*
 * Undo a net_thread_marks_push(): decode the bit mask from the token's
 * offset relative to the base address and clear those bits in the
 * current thread's uu_network_marks.  A token of net_thread_marks_none
 * (offset 0) is a no-op.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* Token must encode a 32-bit mask whose bits are all set. */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8103 
/*
 * Undo a net_thread_unmarks_push(): decode the bit mask from the
 * token's offset relative to the base address and re-set those bits in
 * the current thread's uu_network_marks.  Offset 0 is a no-op.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* Token must encode a 32-bit mask whose bits are all clear. */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
8119 
8120 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8121 net_thread_is_marked(u_int32_t check)
8122 {
8123 	if (check != 0) {
8124 		struct uthread *uth = current_uthread();
8125 		return uth->uu_network_marks & check;
8126 	} else {
8127 		return 0;
8128 	}
8129 }
8130 
8131 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8132 net_thread_is_unmarked(u_int32_t check)
8133 {
8134 	if (check != 0) {
8135 		struct uthread *uth = current_uthread();
8136 		return ~uth->uu_network_marks & check;
8137 	} else {
8138 		return 0;
8139 	}
8140 }
8141 
8142 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)8143 _is_announcement(const struct sockaddr_in * sender_sin,
8144     const struct sockaddr_in * target_sin)
8145 {
8146 	if (target_sin == NULL || sender_sin == NULL) {
8147 		return FALSE;
8148 	}
8149 
8150 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
8151 }
8152 
/*
 * Send an ARP packet on behalf of a protocol.
 *
 * Normally forwards straight to dlil_send_arp_internal() on the given
 * interface.  For an IPv4 link-local ARP request that is not a gratuitous
 * announcement, the request is instead replicated on every attached
 * interface marked IFEF_ARPLL, using that interface's own hardware and
 * IPv4 source addresses.
 *
 * Returns 0 on success, EINVAL on bad/mismatched addresses, or the first
 * non-ENOTSUP result collected from the per-interface sends.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	/* Local, mutable alias of the caller's const target address. */
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	/* Both protocol addresses are mandatory and must agree on family. */
	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		/* Stays ENOTSUP unless at least one interface sends. */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/*
						 * Copy the source IP address so
						 * it stays valid after the ifnet
						 * lock is dropped below.
						 */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* Hold the lladdr ifaddr across the send. */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* Keep only the first meaningful result. */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8267 
8268 /*
8269  * Caller must hold ifnet head lock.
8270  */
8271 static int
ifnet_lookup(struct ifnet * ifp)8272 ifnet_lookup(struct ifnet *ifp)
8273 {
8274 	struct ifnet *_ifp;
8275 
8276 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8277 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8278 		if (_ifp == ifp) {
8279 			break;
8280 		}
8281 	}
8282 	return _ifp != NULL;
8283 }
8284 
8285 /*
8286  * Caller has to pass a non-zero refio argument to get a
8287  * IO reference count. This will prevent ifnet_detach from
8288  * being called when there are outstanding io reference counts.
8289  */
8290 int
ifnet_is_attached(struct ifnet * ifp,int refio)8291 ifnet_is_attached(struct ifnet *ifp, int refio)
8292 {
8293 	int ret;
8294 
8295 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8296 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
8297 		if (refio > 0) {
8298 			ifp->if_refio++;
8299 		}
8300 	}
8301 	lck_mtx_unlock(&ifp->if_ref_lock);
8302 
8303 	return ret;
8304 }
8305 
/*
 * Record that one more kernel thread is being started for this interface;
 * balanced by ifnet_decr_pending_thread_count().
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8313 
/*
 * Drop one pending-thread count; when it reaches zero, wake anyone
 * sleeping on &ifp->if_threads_pending waiting for thread startup to
 * complete.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8325 
/*
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already.
 * Most callers would call ifnet_is_{attached,data_ready}() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* Verify the precondition documented above before bumping. */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8340 
/*
 * Drop one IO reference with if_ref_lock already held.
 * Shared by ifnet_decr_iorefcnt(), ifnet_datamov_end() and
 * ifnet_datamov_resume().
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* Data movers each hold an IO ref, so refio==0 implies datamov==0. */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8361 
/*
 * Public wrapper: take if_ref_lock and drop one IO reference.
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8369 
/*
 * Enter the data path: take both an IO reference and a datamov count,
 * but only if the interface is fully attached and ready (not suspended).
 * Returns TRUE on success; on FALSE the caller must not touch the
 * interface's data path.  Balanced by ifnet_datamov_end().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8384 
/*
 * Leave the data path: drop the datamov count and the IO reference taken
 * by ifnet_datamov_begin(), waking any drainers blocked in
 * ifnet_datamov_drain() when the last mover exits.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8402 
/*
 * Suspend new data movement with if_ref_lock held: takes an IO
 * reference and, on the first suspension, clears IFRF_READY so
 * ifnet_datamov_begin() starts failing.  Suspensions nest via
 * if_suspend; resumed by ifnet_datamov_resume().
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8413 
/*
 * Public wrapper: take if_ref_lock and suspend data movement
 * unconditionally (nests).
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8422 
8423 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8424 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8425 {
8426 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8427 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8428 	if (ifp->if_suspend > 0) {
8429 		lck_mtx_unlock(&ifp->if_ref_lock);
8430 		return FALSE;
8431 	}
8432 	ifnet_datamov_suspend_locked(ifp);
8433 	lck_mtx_unlock(&ifp->if_ref_lock);
8434 	return TRUE;
8435 }
8436 
/*
 * Wait until every in-flight data mover has left the data path, then
 * flush the transmit queue(s).  Data movement must already have been
 * suspended (if_suspend > 0) so no new movers can enter while we sleep.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* Sleep until ifnet_datamov_end() wakes us with datamov == 0. */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8464 
/*
 * Convenience: suspend data movement, then wait for all movers to exit.
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8471 
/*
 * Undo one suspension; when the last one is released, mark the
 * interface ready again and drop the IO reference taken at suspend
 * time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8485 
8486 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8487 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8488 {
8489 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8490 	ctrace_t *tr;
8491 	u_int32_t idx;
8492 	u_int16_t *cnt;
8493 
8494 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8495 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8496 		/* NOTREACHED */
8497 	}
8498 
8499 	if (refhold) {
8500 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8501 		tr = dl_if_dbg->dldbg_if_refhold;
8502 	} else {
8503 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8504 		tr = dl_if_dbg->dldbg_if_refrele;
8505 	}
8506 
8507 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8508 	ctrace_record(&tr[idx]);
8509 }
8510 
/*
 * Take a reference on the underlying dlil_ifnet.  Panics on refcount
 * wraparound; invokes the optional trace hook for debug builds.
 * Returns 0, or EINVAL if ifp is NULL.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8533 
8534 errno_t
dlil_if_free(struct ifnet * ifp)8535 dlil_if_free(struct ifnet *ifp)
8536 {
8537 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8538 	bool need_release = FALSE;
8539 
8540 	if (dl_if == NULL) {
8541 		return EINVAL;
8542 	}
8543 
8544 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8545 	switch (dl_if->dl_if_refcnt) {
8546 	case 0:
8547 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8548 		/* NOTREACHED */
8549 		break;
8550 	case 1:
8551 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8552 			need_release = TRUE;
8553 		}
8554 		break;
8555 	default:
8556 		break;
8557 	}
8558 	--dl_if->dl_if_refcnt;
8559 	if (dl_if->dl_if_trace != NULL) {
8560 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8561 	}
8562 	lck_mtx_unlock(&dl_if->dl_if_lock);
8563 	if (need_release) {
8564 		_dlil_if_release(ifp, true);
8565 	}
8566 	return 0;
8567 }
8568 
/*
 * Attach an already-constructed if_proto to its interface: rejects
 * duplicates, lets the family module refine the demux descriptors,
 * inserts the proto at the tail of its hash chain, and posts a
 * KEV_DL_PROTO_ATTACHED event.  On success *proto_count (if non-NULL)
 * receives the number of protocols now attached.  Holds an IO reference
 * on the interface for the duration of the call.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* Already attached; drop the lookup ref and bail. */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* Release the IO ref taken by ifnet_is_attached(ifp, 1) above. */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8648 
8649 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8650 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8651 {
8652 	/*
8653 	 * A protocol has been attached, mark the interface up.
8654 	 * This used to be done by configd.KernelEventMonitor, but that
8655 	 * is inherently prone to races (rdar://problem/30810208).
8656 	 */
8657 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8658 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8659 	dlil_post_sifflags_msg(ifp);
8660 #if SKYWALK
8661 	switch (protocol) {
8662 	case AF_INET:
8663 	case AF_INET6:
8664 		/* don't attach the flowswitch unless attaching IP */
8665 		dlil_attach_flowswitch_nexus(ifp);
8666 		break;
8667 	default:
8668 		break;
8669 	}
8670 #endif /* SKYWALK */
8671 }
8672 
/*
 * KPI: attach a v1 protocol to an interface.  Validates arguments and
 * interface membership under the ifnet head lock, builds an if_proto
 * from the caller's parameter block, and hands it to
 * dlil_attach_protocol().  On success the interface is marked up; on
 * failure the if_proto is freed.  Returns 0, EINVAL, ENXIO, EEXIST, or
 * the family module's error.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* Mark the interface up now that a protocol is attached. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8734 
/*
 * KPI: attach a v2 protocol to an interface.  Identical flow to
 * ifnet_attach_protocol() except the callbacks are stored in the
 * kpi.v2 union member (the v2 input callback takes no frame header).
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* Mark the interface up now that a protocol is attached. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8796 
/*
 * KPI: detach a protocol from an interface.  Removes the if_proto from
 * its hash chain, notifies the family module, and swaps the protocol's
 * callbacks for inert ifproto_media_* placeholders so in-flight callers
 * holding a proto reference fail safely with ENXIO.  Two references are
 * dropped here (the attach ref and the lookup ref); the final teardown
 * happens when the last proto reference is released.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* Neutralize callbacks for any callers still holding a proto ref. */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8862 
/*
 * Inert v1 input placeholder installed by ifnet_detach_protocol();
 * rejects packets with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8870 
/*
 * Inert v2 input placeholder installed by ifnet_detach_protocol();
 * rejects packets with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8878 
/*
 * Inert pre-output placeholder installed by ifnet_detach_protocol();
 * fails output framing with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8887 
/*
 * Inert event placeholder installed by ifnet_detach_protocol();
 * silently ignores kernel events once the protocol is detached.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8894 
/*
 * Inert ioctl placeholder installed by ifnet_detach_protocol();
 * fails ioctls with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8902 
/*
 * Inert multicast-resolve placeholder installed by
 * ifnet_detach_protocol(); fails with ENXIO once the protocol is
 * detached.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8910 
/*
 * Inert send_arp placeholder installed by ifnet_detach_protocol();
 * fails with ENXIO once the protocol is detached.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8919 
8920 extern int if_next_index(void);
8921 extern int tcp_ecn_outbound;
8922 
8923 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8924 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8925 {
8926 	uint32_t sflags = 0;
8927 	int err;
8928 
8929 	if (if_flowadv) {
8930 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8931 	}
8932 
8933 	if (if_delaybased_queue) {
8934 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8935 	}
8936 
8937 	if (ifp->if_output_sched_model ==
8938 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8939 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8940 	}
8941 	/* Inherit drop limit from the default queue */
8942 	if (ifp->if_snd != ifcq) {
8943 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8944 	}
8945 	/* Initialize transmit queue(s) */
8946 	err = ifclassq_setup(ifcq, ifp, sflags);
8947 	if (err != 0) {
8948 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8949 		    "err=%d", __func__, ifp, err);
8950 		/* NOTREACHED */
8951 	}
8952 }
8953 
8954 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8955 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8956 {
8957 #if SKYWALK
8958 	boolean_t netif_compat;
8959 	if_nexus_netif  nexus_netif;
8960 #endif /* SKYWALK */
8961 	struct ifnet *tmp_if;
8962 	struct ifaddr *ifa;
8963 	struct if_data_internal if_data_saved;
8964 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8965 	struct dlil_threading_info *dl_inp;
8966 	thread_continue_t thfunc = NULL;
8967 	int err;
8968 
8969 	if (ifp == NULL) {
8970 		return EINVAL;
8971 	}
8972 
8973 	/*
8974 	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8975 	 * prevent the interface from being configured while it is
8976 	 * embryonic, as ifnet_head_lock is dropped and reacquired
8977 	 * below prior to marking the ifnet with IFRF_ATTACHED.
8978 	 */
8979 	dlil_if_lock();
8980 	ifnet_head_lock_exclusive();
8981 	/* Verify we aren't already on the list */
8982 	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8983 		if (tmp_if == ifp) {
8984 			ifnet_head_done();
8985 			dlil_if_unlock();
8986 			return EEXIST;
8987 		}
8988 	}
8989 
8990 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8991 	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8992 		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8993 		    __func__, ifp);
8994 		/* NOTREACHED */
8995 	}
8996 	lck_mtx_unlock(&ifp->if_ref_lock);
8997 
8998 	ifnet_lock_exclusive(ifp);
8999 
9000 	/* Sanity check */
9001 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9002 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9003 	VERIFY(ifp->if_threads_pending == 0);
9004 
9005 	if (ll_addr != NULL) {
9006 		if (ifp->if_addrlen == 0) {
9007 			ifp->if_addrlen = ll_addr->sdl_alen;
9008 		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
9009 			ifnet_lock_done(ifp);
9010 			ifnet_head_done();
9011 			dlil_if_unlock();
9012 			return EINVAL;
9013 		}
9014 	}
9015 
9016 	/*
9017 	 * Allow interfaces without protocol families to attach
9018 	 * only if they have the necessary fields filled out.
9019 	 */
9020 	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
9021 		DLIL_PRINTF("%s: Attempt to attach interface without "
9022 		    "family module - %d\n", __func__, ifp->if_family);
9023 		ifnet_lock_done(ifp);
9024 		ifnet_head_done();
9025 		dlil_if_unlock();
9026 		return ENODEV;
9027 	}
9028 
9029 	/* Allocate protocol hash table */
9030 	VERIFY(ifp->if_proto_hash == NULL);
9031 	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
9032 	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9033 
9034 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9035 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9036 	TAILQ_INIT(&ifp->if_flt_head);
9037 	VERIFY(ifp->if_flt_busy == 0);
9038 	VERIFY(ifp->if_flt_waiters == 0);
9039 	VERIFY(ifp->if_flt_non_os_count == 0);
9040 	VERIFY(ifp->if_flt_no_tso_count == 0);
9041 	lck_mtx_unlock(&ifp->if_flt_lock);
9042 
9043 	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
9044 		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
9045 		LIST_INIT(&ifp->if_multiaddrs);
9046 	}
9047 
9048 	VERIFY(ifp->if_allhostsinm == NULL);
9049 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9050 	TAILQ_INIT(&ifp->if_addrhead);
9051 
9052 	if (ifp->if_index == 0) {
9053 		int idx = if_next_index();
9054 
9055 		/*
9056 		 * Since we exhausted the list of
9057 		 * if_index's, try to find an empty slot
9058 		 * in ifindex2ifnet.
9059 		 */
9060 		if (idx == -1 && if_index >= UINT16_MAX) {
9061 			for (int i = 1; i < if_index; i++) {
9062 				if (ifindex2ifnet[i] == NULL &&
9063 				    ifnet_addrs[i - 1] == NULL) {
9064 					idx = i;
9065 					break;
9066 				}
9067 			}
9068 		}
9069 		if (idx == -1) {
9070 			ifp->if_index = 0;
9071 			ifnet_lock_done(ifp);
9072 			ifnet_head_done();
9073 			dlil_if_unlock();
9074 			return ENOBUFS;
9075 		}
9076 		ifp->if_index = (uint16_t)idx;
9077 
9078 		/* the lladdr passed at attach time is the permanent address */
9079 		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
9080 		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
9081 			bcopy(CONST_LLADDR(ll_addr),
9082 			    dl_if->dl_if_permanent_ether,
9083 			    ETHER_ADDR_LEN);
9084 			dl_if->dl_if_permanent_ether_is_set = 1;
9085 		}
9086 	}
9087 	/* There should not be anything occupying this slot */
9088 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9089 
9090 	/* allocate (if needed) and initialize a link address */
9091 	ifa = dlil_alloc_lladdr(ifp, ll_addr);
9092 	if (ifa == NULL) {
9093 		ifnet_lock_done(ifp);
9094 		ifnet_head_done();
9095 		dlil_if_unlock();
9096 		return ENOBUFS;
9097 	}
9098 
9099 	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
9100 	ifnet_addrs[ifp->if_index - 1] = ifa;
9101 
9102 	/* make this address the first on the list */
9103 	IFA_LOCK(ifa);
9104 	/* hold a reference for ifnet_addrs[] */
9105 	IFA_ADDREF_LOCKED(ifa);
9106 	/* if_attach_link_ifa() holds a reference for ifa_link */
9107 	if_attach_link_ifa(ifp, ifa);
9108 	IFA_UNLOCK(ifa);
9109 
9110 	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
9111 	ifindex2ifnet[ifp->if_index] = ifp;
9112 
9113 	/* Hold a reference to the underlying dlil_ifnet */
9114 	ifnet_reference(ifp);
9115 
9116 	/* Clear stats (save and restore other fields that we care) */
9117 	if_data_saved = ifp->if_data;
9118 	bzero(&ifp->if_data, sizeof(ifp->if_data));
9119 	ifp->if_data.ifi_type = if_data_saved.ifi_type;
9120 	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
9121 	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
9122 	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
9123 	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
9124 	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
9125 	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
9126 	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
9127 	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
9128 	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
9129 	ifnet_touch_lastchange(ifp);
9130 
9131 	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
9132 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
9133 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
9134 
9135 	dlil_ifclassq_setup(ifp, ifp->if_snd);
9136 
9137 	/* Sanity checks on the input thread storage */
9138 	dl_inp = &dl_if->dl_if_inpstorage;
9139 	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
9140 	VERIFY(dl_inp->dlth_flags == 0);
9141 	VERIFY(dl_inp->dlth_wtot == 0);
9142 	VERIFY(dl_inp->dlth_ifp == NULL);
9143 	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
9144 	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
9145 	VERIFY(!dl_inp->dlth_affinity);
9146 	VERIFY(ifp->if_inp == NULL);
9147 	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
9148 	VERIFY(dl_inp->dlth_strategy == NULL);
9149 	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
9150 	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
9151 	VERIFY(dl_inp->dlth_affinity_tag == 0);
9152 
9153 #if IFNET_INPUT_SANITY_CHK
9154 	VERIFY(dl_inp->dlth_pkts_cnt == 0);
9155 #endif /* IFNET_INPUT_SANITY_CHK */
9156 
9157 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9158 	dlil_reset_rxpoll_params(ifp);
9159 	/*
9160 	 * A specific DLIL input thread is created per non-loopback interface.
9161 	 */
9162 	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
9163 		ifp->if_inp = dl_inp;
9164 		ifnet_incr_pending_thread_count(ifp);
9165 		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
9166 		if (err == ENODEV) {
9167 			VERIFY(thfunc == NULL);
9168 			ifnet_decr_pending_thread_count(ifp);
9169 		} else if (err != 0) {
9170 			panic_plain("%s: ifp=%p couldn't get an input thread; "
9171 			    "err=%d", __func__, ifp, err);
9172 			/* NOTREACHED */
9173 		}
9174 	}
9175 	/*
9176 	 * If the driver supports the new transmit model, calculate flow hash
9177 	 * and create a workloop starter thread to invoke the if_start callback
9178 	 * where the packets may be dequeued and transmitted.
9179 	 */
9180 	if (ifp->if_eflags & IFEF_TXSTART) {
9181 		thread_precedence_policy_data_t info;
9182 		__unused kern_return_t kret;
9183 
9184 		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
9185 		VERIFY(ifp->if_flowhash != 0);
9186 		VERIFY(ifp->if_start_thread == THREAD_NULL);
9187 
9188 		ifnet_set_start_cycle(ifp, NULL);
9189 		ifp->if_start_pacemaker_time = 0;
9190 		ifp->if_start_active = 0;
9191 		ifp->if_start_req = 0;
9192 		ifp->if_start_flags = 0;
9193 		VERIFY(ifp->if_start != NULL);
9194 		ifnet_incr_pending_thread_count(ifp);
9195 		if ((err = kernel_thread_start(ifnet_start_thread_func,
9196 		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
9197 			panic_plain("%s: "
9198 			    "ifp=%p couldn't get a start thread; "
9199 			    "err=%d", __func__, ifp, err);
9200 			/* NOTREACHED */
9201 		}
9202 		bzero(&info, sizeof(info));
9203 		info.importance = 1;
9204 		kret = thread_policy_set(ifp->if_start_thread,
9205 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9206 		    THREAD_PRECEDENCE_POLICY_COUNT);
9207 		ASSERT(kret == KERN_SUCCESS);
9208 	} else {
9209 		ifp->if_flowhash = 0;
9210 	}
9211 
9212 	/* Reset polling parameters */
9213 	ifnet_set_poll_cycle(ifp, NULL);
9214 	ifp->if_poll_update = 0;
9215 	ifp->if_poll_flags = 0;
9216 	ifp->if_poll_req = 0;
9217 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9218 
9219 	/*
9220 	 * If the driver supports the new receive model, create a poller
9221 	 * thread to invoke if_input_poll callback where the packets may
9222 	 * be dequeued from the driver and processed for reception.
9223 	 * if the interface is netif compat then the poller thread is
9224 	 * managed by netif.
9225 	 */
9226 	if (thfunc == dlil_rxpoll_input_thread_func) {
9227 		thread_precedence_policy_data_t info;
9228 		__unused kern_return_t kret;
9229 #if SKYWALK
9230 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9231 #endif /* SKYWALK */
9232 		VERIFY(ifp->if_input_poll != NULL);
9233 		VERIFY(ifp->if_input_ctl != NULL);
9234 		ifnet_incr_pending_thread_count(ifp);
9235 		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
9236 		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
9237 			panic_plain("%s: ifp=%p couldn't get a poll thread; "
9238 			    "err=%d", __func__, ifp, err);
9239 			/* NOTREACHED */
9240 		}
9241 		bzero(&info, sizeof(info));
9242 		info.importance = 1;
9243 		kret = thread_policy_set(ifp->if_poll_thread,
9244 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9245 		    THREAD_PRECEDENCE_POLICY_COUNT);
9246 		ASSERT(kret == KERN_SUCCESS);
9247 	}
9248 
9249 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9250 	VERIFY(ifp->if_desc.ifd_len == 0);
9251 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9252 
9253 	/* Record attach PC stacktrace */
9254 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
9255 
9256 	ifp->if_updatemcasts = 0;
9257 	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
9258 		struct ifmultiaddr *ifma;
9259 		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
9260 			IFMA_LOCK(ifma);
9261 			if (ifma->ifma_addr->sa_family == AF_LINK ||
9262 			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
9263 				ifp->if_updatemcasts++;
9264 			}
9265 			IFMA_UNLOCK(ifma);
9266 		}
9267 
9268 		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
9269 		    "membership(s)\n", if_name(ifp),
9270 		    ifp->if_updatemcasts);
9271 	}
9272 
9273 	/* Clear logging parameters */
9274 	bzero(&ifp->if_log, sizeof(ifp->if_log));
9275 
9276 	/* Clear foreground/realtime activity timestamps */
9277 	ifp->if_fg_sendts = 0;
9278 	ifp->if_rt_sendts = 0;
9279 
9280 	/* Clear throughput estimates and radio type */
9281 	ifp->if_estimated_up_bucket = 0;
9282 	ifp->if_estimated_down_bucket = 0;
9283 	ifp->if_radio_type = 0;
9284 	ifp->if_radio_channel = 0;
9285 
9286 	VERIFY(ifp->if_delegated.ifp == NULL);
9287 	VERIFY(ifp->if_delegated.type == 0);
9288 	VERIFY(ifp->if_delegated.family == 0);
9289 	VERIFY(ifp->if_delegated.subfamily == 0);
9290 	VERIFY(ifp->if_delegated.expensive == 0);
9291 	VERIFY(ifp->if_delegated.constrained == 0);
9292 
9293 	VERIFY(ifp->if_agentids == NULL);
9294 	VERIFY(ifp->if_agentcount == 0);
9295 
9296 	/* Reset interface state */
9297 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9298 	ifp->if_interface_state.valid_bitmask |=
9299 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
9300 	ifp->if_interface_state.interface_availability =
9301 	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
9302 
9303 	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
9304 	if (ifp == lo_ifp) {
9305 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
9306 		ifp->if_interface_state.valid_bitmask |=
9307 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
9308 	} else {
9309 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
9310 	}
9311 
9312 	/*
9313 	 * Enable ECN capability on this interface depending on the
9314 	 * value of ECN global setting
9315 	 */
9316 	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
9317 		if_set_eflags(ifp, IFEF_ECN_ENABLE);
9318 		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
9319 	}
9320 
9321 	/*
9322 	 * Built-in Cyclops always on policy for WiFi infra
9323 	 */
9324 	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9325 		errno_t error;
9326 
9327 		error = if_set_qosmarking_mode(ifp,
9328 		    IFRTYPE_QOSMARKING_FASTLANE);
9329 		if (error != 0) {
9330 			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9331 			    __func__, ifp->if_xname, error);
9332 		} else {
9333 			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9334 #if (DEVELOPMENT || DEBUG)
9335 			DLIL_PRINTF("%s fastlane enabled on %s\n",
9336 			    __func__, ifp->if_xname);
9337 #endif /* (DEVELOPMENT || DEBUG) */
9338 		}
9339 	}
9340 
9341 	ifnet_lock_done(ifp);
9342 	ifnet_head_done();
9343 
9344 #if SKYWALK
9345 	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9346 #endif /* SKYWALK */
9347 
9348 	lck_mtx_lock(&ifp->if_cached_route_lock);
9349 	/* Enable forwarding cached route */
9350 	ifp->if_fwd_cacheok = 1;
9351 	/* Clean up any existing cached routes */
9352 	ROUTE_RELEASE(&ifp->if_fwd_route);
9353 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9354 	ROUTE_RELEASE(&ifp->if_src_route);
9355 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9356 	ROUTE_RELEASE(&ifp->if_src_route6);
9357 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9358 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9359 
9360 	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9361 
9362 	/*
9363 	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9364 	 * and trees; do this before the ifnet is marked as attached.
9365 	 * The ifnet keeps the reference to the info structures even after
9366 	 * the ifnet is detached, since the network-layer records still
9367 	 * refer to the info structures even after that.  This also
9368 	 * makes it possible for them to still function after the ifnet
9369 	 * is recycled or reattached.
9370 	 */
9371 #if INET
9372 	if (IGMP_IFINFO(ifp) == NULL) {
9373 		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9374 		VERIFY(IGMP_IFINFO(ifp) != NULL);
9375 	} else {
9376 		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9377 		igmp_domifreattach(IGMP_IFINFO(ifp));
9378 	}
9379 #endif /* INET */
9380 	if (MLD_IFINFO(ifp) == NULL) {
9381 		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9382 		VERIFY(MLD_IFINFO(ifp) != NULL);
9383 	} else {
9384 		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9385 		mld_domifreattach(MLD_IFINFO(ifp));
9386 	}
9387 
9388 	VERIFY(ifp->if_data_threshold == 0);
9389 	VERIFY(ifp->if_dt_tcall != NULL);
9390 
9391 	/*
9392 	 * Wait for the created kernel threads for I/O to get
9393 	 * scheduled and run at least once before we proceed
9394 	 * to mark interface as attached.
9395 	 */
9396 	lck_mtx_lock(&ifp->if_ref_lock);
9397 	while (ifp->if_threads_pending != 0) {
9398 		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9399 		    "interface %s to get scheduled at least once.\n",
9400 		    __func__, ifp->if_xname);
9401 		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9402 		    __func__, NULL);
9403 		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9404 	}
9405 	lck_mtx_unlock(&ifp->if_ref_lock);
9406 	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9407 	    "at least once. Proceeding.\n", __func__, ifp->if_xname);
9408 
9409 	/* Final mark this ifnet as attached. */
9410 	ifnet_lock_exclusive(ifp);
9411 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9412 	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9413 	lck_mtx_unlock(&ifp->if_ref_lock);
9414 	if (net_rtref) {
9415 		/* boot-args override; enable idle notification */
9416 		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9417 		    IFRF_IDLE_NOTIFY);
9418 	} else {
9419 		/* apply previous request(s) to set the idle flags, if any */
9420 		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9421 		    ifp->if_idle_new_flags_mask);
9422 	}
9423 #if SKYWALK
9424 	/* the interface is fully attached; let the nexus adapter know */
9425 	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9426 		if (netif_compat) {
9427 			if (sk_netif_compat_txmodel ==
9428 			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9429 				ifnet_enqueue_multi_setup(ifp,
9430 				    sk_tx_delay_qlen, sk_tx_delay_timeout);
9431 			}
9432 			ifp->if_nx_netif = nexus_netif;
9433 		}
9434 		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9435 	}
9436 #endif /* SKYWALK */
9437 	ifnet_lock_done(ifp);
9438 	dlil_if_unlock();
9439 
9440 #if PF
9441 	/*
9442 	 * Attach packet filter to this interface, if enabled.
9443 	 */
9444 	pf_ifnet_hook(ifp, 1);
9445 #endif /* PF */
9446 
9447 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9448 
9449 	if (dlil_verbose) {
9450 		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9451 		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9452 	}
9453 
9454 	return 0;
9455 }
9456 
/*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself.  Although the link
 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
 * its location in memory must never change as it may still be referred
 * to by some parts of the system afterwards (unfortunate implementation
 * artifacts inherited from BSD.)
 *
 * Caller must hold ifnet lock as writer.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute the sockaddr_dl sizes: the address needs the fixed
	 * header, the interface name, and the link-layer address bytes;
	 * the netmask covers only the header-plus-name portion.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
	/* round the total up to a multiple of sizeof (u_int32_t) */
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: lives as long as the ifnet */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* fill in the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* the netmask marks the name portion with all-ones bytes */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference held on the previous link address, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9575 
/*
 * Ask the protocol layers to drop all network-layer addresses that are
 * assigned to this interface (IPv4 when INET is configured, and IPv6).
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9584 
/*
 * First phase of interface detach: mark the ifnet as detaching, remove
 * it from the global lookup structures (ifnet_head, ifindex2ifnet[]),
 * reset per-interface state, and hand the ifnet off to the detacher
 * worker thread, which performs the final teardown outside the caller's
 * context to avoid reentrancy.
 *
 * Returns 0 on success, EINVAL if ifp is NULL or not attached, or
 * ENXIO if a detach is already in progress for this interface.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* invalidate ND6 CGA state so it is regenerated on reattach */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* tear down any network emulation (netem) state on output */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		/* Interface was never (fully) attached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9779 
9780 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9781 ifnet_detaching_enqueue(struct ifnet *ifp)
9782 {
9783 	dlil_if_lock_assert();
9784 
9785 	++ifnet_detaching_cnt;
9786 	VERIFY(ifnet_detaching_cnt != 0);
9787 	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9788 	wakeup((caddr_t)&ifnet_delayed_run);
9789 }
9790 
9791 static struct ifnet *
ifnet_detaching_dequeue(void)9792 ifnet_detaching_dequeue(void)
9793 {
9794 	struct ifnet *ifp;
9795 
9796 	dlil_if_lock_assert();
9797 
9798 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9799 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9800 	if (ifp != NULL) {
9801 		VERIFY(ifnet_detaching_cnt != 0);
9802 		--ifnet_detaching_cnt;
9803 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9804 		ifp->if_detaching_link.tqe_next = NULL;
9805 		ifp->if_detaching_link.tqe_prev = NULL;
9806 	}
9807 	return ifp;
9808 }
9809 
/*
 * Continuation routine for the detacher thread.  Drains the queue of
 * detaching interfaces, invoking ifnet_detach_final() on each with the
 * dlil lock dropped, then blocks on ifnet_delayed_run with itself as
 * the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	/* first pass after thread creation: leave the embryonic state */
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		/* stop draining once the queue is empty */
		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; final detach may block for a while */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us up */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9852 
/*
 * Entry point for the detacher thread.  Arms the wait on
 * ifnet_delayed_run, marks itself embryonic, and issues a self-wakeup
 * so the continuation runs once to clear the embryonic state before
 * blocking for real.  Never returns; all subsequent work happens in
 * ifnet_detacher_thread_cont().
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9869 
9870 static void
ifnet_detach_final(struct ifnet * ifp)9871 ifnet_detach_final(struct ifnet *ifp)
9872 {
9873 	struct ifnet_filter *filter, *filter_next;
9874 	struct dlil_ifnet *dlifp;
9875 	struct ifnet_filter_head fhead;
9876 	struct dlil_threading_info *inp;
9877 	struct ifaddr *ifa;
9878 	ifnet_detached_func if_free;
9879 	int i;
9880 
9881 	/* Let BPF know we're detaching */
9882 	bpfdetach(ifp);
9883 
9884 #if SKYWALK
9885 	dlil_netif_detach_notify(ifp);
9886 	/*
9887 	 * Wait for the datapath to quiesce before tearing down
9888 	 * netif/flowswitch nexuses.
9889 	 */
9890 	dlil_quiesce_and_detach_nexuses(ifp);
9891 #endif /* SKYWALK */
9892 
9893 	lck_mtx_lock(&ifp->if_ref_lock);
9894 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9895 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9896 		    __func__, ifp);
9897 		/* NOTREACHED */
9898 	}
9899 
9900 	/*
9901 	 * Wait until the existing IO references get released
9902 	 * before we proceed with ifnet_detach.  This is not a
9903 	 * common case, so block without using a continuation.
9904 	 */
9905 	while (ifp->if_refio > 0) {
9906 		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9907 		    "to be released\n", __func__, if_name(ifp));
9908 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9909 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9910 	}
9911 
9912 	VERIFY(ifp->if_datamov == 0);
9913 	VERIFY(ifp->if_drainers == 0);
9914 	VERIFY(ifp->if_suspend == 0);
9915 	ifp->if_refflags &= ~IFRF_READY;
9916 	lck_mtx_unlock(&ifp->if_ref_lock);
9917 
9918 	/* Clear agent IDs */
9919 	if (ifp->if_agentids != NULL) {
9920 		kfree_data(ifp->if_agentids,
9921 		    sizeof(uuid_t) * ifp->if_agentcount);
9922 		ifp->if_agentids = NULL;
9923 	}
9924 	ifp->if_agentcount = 0;
9925 
9926 #if SKYWALK
9927 	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9928 #endif /* SKYWALK */
9929 	/* Drain and destroy send queue */
9930 	ifclassq_teardown(ifp->if_snd);
9931 
9932 	/* Detach interface filters */
9933 	lck_mtx_lock(&ifp->if_flt_lock);
9934 	if_flt_monitor_enter(ifp);
9935 
9936 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9937 	fhead = ifp->if_flt_head;
9938 	TAILQ_INIT(&ifp->if_flt_head);
9939 
9940 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9941 		filter_next = TAILQ_NEXT(filter, filt_next);
9942 		lck_mtx_unlock(&ifp->if_flt_lock);
9943 
9944 		dlil_detach_filter_internal(filter, 1);
9945 		lck_mtx_lock(&ifp->if_flt_lock);
9946 	}
9947 	if_flt_monitor_leave(ifp);
9948 	lck_mtx_unlock(&ifp->if_flt_lock);
9949 
9950 	/* Tell upper layers to drop their network addresses */
9951 	if_purgeaddrs(ifp);
9952 
9953 	ifnet_lock_exclusive(ifp);
9954 
9955 	/* Unplumb all protocols */
9956 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9957 		struct if_proto *proto;
9958 
9959 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9960 		while (proto != NULL) {
9961 			protocol_family_t family = proto->protocol_family;
9962 			ifnet_lock_done(ifp);
9963 			proto_unplumb(family, ifp);
9964 			ifnet_lock_exclusive(ifp);
9965 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9966 		}
9967 		/* There should not be any protocols left */
9968 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9969 	}
9970 	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9971 	ifp->if_proto_hash = NULL;
9972 
9973 	/* Detach (permanent) link address from if_addrhead */
9974 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9975 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9976 	IFA_LOCK(ifa);
9977 	if_detach_link_ifa(ifp, ifa);
9978 	IFA_UNLOCK(ifa);
9979 
9980 	/* Remove (permanent) link address from ifnet_addrs[] */
9981 	IFA_REMREF(ifa);
9982 	ifnet_addrs[ifp->if_index - 1] = NULL;
9983 
9984 	/* This interface should not be on {ifnet_head,detaching} */
9985 	VERIFY(ifp->if_link.tqe_next == NULL);
9986 	VERIFY(ifp->if_link.tqe_prev == NULL);
9987 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9988 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9989 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9990 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9991 
9992 	/* The slot should have been emptied */
9993 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9994 
9995 	/* There should not be any addresses left */
9996 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9997 
9998 	/*
9999 	 * Signal the starter thread to terminate itself, and wait until
10000 	 * it has exited.
10001 	 */
10002 	if (ifp->if_start_thread != THREAD_NULL) {
10003 		lck_mtx_lock_spin(&ifp->if_start_lock);
10004 		ifp->if_start_flags |= IFSF_TERMINATING;
10005 		wakeup_one((caddr_t)&ifp->if_start_thread);
10006 		lck_mtx_unlock(&ifp->if_start_lock);
10007 
10008 		/* wait for starter thread to terminate */
10009 		lck_mtx_lock(&ifp->if_start_lock);
10010 		while (ifp->if_start_thread != THREAD_NULL) {
10011 			if (dlil_verbose) {
10012 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10013 				    __func__,
10014 				    if_name(ifp));
10015 			}
10016 			(void) msleep(&ifp->if_start_thread,
10017 			    &ifp->if_start_lock, (PZERO - 1),
10018 			    "ifnet_start_thread_exit", NULL);
10019 		}
10020 		lck_mtx_unlock(&ifp->if_start_lock);
10021 		if (dlil_verbose) {
10022 			DLIL_PRINTF("%s: %s starter thread termination complete",
10023 			    __func__, if_name(ifp));
10024 		}
10025 	}
10026 
10027 	/*
10028 	 * Signal the poller thread to terminate itself, and wait until
10029 	 * it has exited.
10030 	 */
10031 	if (ifp->if_poll_thread != THREAD_NULL) {
10032 #if SKYWALK
10033 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10034 #endif /* SKYWALK */
10035 		lck_mtx_lock_spin(&ifp->if_poll_lock);
10036 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10037 		wakeup_one((caddr_t)&ifp->if_poll_thread);
10038 		lck_mtx_unlock(&ifp->if_poll_lock);
10039 
10040 		/* wait for poller thread to terminate */
10041 		lck_mtx_lock(&ifp->if_poll_lock);
10042 		while (ifp->if_poll_thread != THREAD_NULL) {
10043 			if (dlil_verbose) {
10044 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10045 				    __func__,
10046 				    if_name(ifp));
10047 			}
10048 			(void) msleep(&ifp->if_poll_thread,
10049 			    &ifp->if_poll_lock, (PZERO - 1),
10050 			    "ifnet_poll_thread_exit", NULL);
10051 		}
10052 		lck_mtx_unlock(&ifp->if_poll_lock);
10053 		if (dlil_verbose) {
10054 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
10055 			    __func__, if_name(ifp));
10056 		}
10057 	}
10058 
10059 	/*
10060 	 * If thread affinity was set for the workloop thread, we will need
10061 	 * to tear down the affinity and release the extra reference count
10062 	 * taken at attach time.  Does not apply to lo0 or other interfaces
10063 	 * without dedicated input threads.
10064 	 */
10065 	if ((inp = ifp->if_inp) != NULL) {
10066 		VERIFY(inp != dlil_main_input_thread);
10067 
10068 		if (inp->dlth_affinity) {
10069 			struct thread *tp, *wtp, *ptp;
10070 
10071 			lck_mtx_lock_spin(&inp->dlth_lock);
10072 			wtp = inp->dlth_driver_thread;
10073 			inp->dlth_driver_thread = THREAD_NULL;
10074 			ptp = inp->dlth_poller_thread;
10075 			inp->dlth_poller_thread = THREAD_NULL;
10076 			ASSERT(inp->dlth_thread != THREAD_NULL);
10077 			tp = inp->dlth_thread;    /* don't nullify now */
10078 			inp->dlth_affinity_tag = 0;
10079 			inp->dlth_affinity = FALSE;
10080 			lck_mtx_unlock(&inp->dlth_lock);
10081 
10082 			/* Tear down poll thread affinity */
10083 			if (ptp != NULL) {
10084 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10085 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
10086 				(void) dlil_affinity_set(ptp,
10087 				    THREAD_AFFINITY_TAG_NULL);
10088 				thread_deallocate(ptp);
10089 			}
10090 
10091 			/* Tear down workloop thread affinity */
10092 			if (wtp != NULL) {
10093 				(void) dlil_affinity_set(wtp,
10094 				    THREAD_AFFINITY_TAG_NULL);
10095 				thread_deallocate(wtp);
10096 			}
10097 
10098 			/* Tear down DLIL input thread affinity */
10099 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10100 			thread_deallocate(tp);
10101 		}
10102 
10103 		/* disassociate ifp DLIL input thread */
10104 		ifp->if_inp = NULL;
10105 
10106 		/* if the worker thread was created, tell it to terminate */
10107 		if (inp->dlth_thread != THREAD_NULL) {
10108 			lck_mtx_lock_spin(&inp->dlth_lock);
10109 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10110 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10111 				wakeup_one((caddr_t)&inp->dlth_flags);
10112 			}
10113 			lck_mtx_unlock(&inp->dlth_lock);
10114 			ifnet_lock_done(ifp);
10115 
10116 			/* wait for the input thread to terminate */
10117 			lck_mtx_lock_spin(&inp->dlth_lock);
10118 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10119 			    == 0) {
10120 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
10121 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
10122 			}
10123 			lck_mtx_unlock(&inp->dlth_lock);
10124 			ifnet_lock_exclusive(ifp);
10125 		}
10126 
10127 		/* clean-up input thread state */
10128 		dlil_clean_threading_info(inp);
10129 		/* clean-up poll parameters */
10130 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
10131 		dlil_reset_rxpoll_params(ifp);
10132 	}
10133 
10134 	/* The driver might unload, so point these to ourselves */
10135 	if_free = ifp->if_free;
10136 	ifp->if_output_dlil = ifp_if_output;
10137 	ifp->if_output = ifp_if_output;
10138 	ifp->if_pre_enqueue = ifp_if_output;
10139 	ifp->if_start = ifp_if_start;
10140 	ifp->if_output_ctl = ifp_if_ctl;
10141 	ifp->if_input_dlil = ifp_if_input;
10142 	ifp->if_input_poll = ifp_if_input_poll;
10143 	ifp->if_input_ctl = ifp_if_ctl;
10144 	ifp->if_ioctl = ifp_if_ioctl;
10145 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10146 	ifp->if_free = ifp_if_free;
10147 	ifp->if_demux = ifp_if_demux;
10148 	ifp->if_event = ifp_if_event;
10149 	ifp->if_framer_legacy = ifp_if_framer;
10150 	ifp->if_framer = ifp_if_framer_extended;
10151 	ifp->if_add_proto = ifp_if_add_proto;
10152 	ifp->if_del_proto = ifp_if_del_proto;
10153 	ifp->if_check_multi = ifp_if_check_multi;
10154 
10155 	/* wipe out interface description */
10156 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10157 	ifp->if_desc.ifd_len = 0;
10158 	VERIFY(ifp->if_desc.ifd_desc != NULL);
10159 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
10160 
10161 	/* there shouldn't be any delegation by now */
10162 	VERIFY(ifp->if_delegated.ifp == NULL);
10163 	VERIFY(ifp->if_delegated.type == 0);
10164 	VERIFY(ifp->if_delegated.family == 0);
10165 	VERIFY(ifp->if_delegated.subfamily == 0);
10166 	VERIFY(ifp->if_delegated.expensive == 0);
10167 	VERIFY(ifp->if_delegated.constrained == 0);
10168 
10169 	/* QoS marking get cleared */
10170 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10171 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10172 
10173 #if SKYWALK
10174 	/* the nexus destructor is responsible for clearing these */
10175 	VERIFY(ifp->if_na_ops == NULL);
10176 	VERIFY(ifp->if_na == NULL);
10177 #endif /* SKYWALK */
10178 
10179 	/* promiscuous/allmulti counts need to start at zero again */
10180 	ifp->if_pcount = 0;
10181 	ifp->if_amcount = 0;
10182 	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10183 
10184 	ifnet_lock_done(ifp);
10185 
10186 #if PF
10187 	/*
10188 	 * Detach this interface from packet filter, if enabled.
10189 	 */
10190 	pf_ifnet_hook(ifp, 0);
10191 #endif /* PF */
10192 
10193 	/* Filter list should be empty */
10194 	lck_mtx_lock_spin(&ifp->if_flt_lock);
10195 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10196 	VERIFY(ifp->if_flt_busy == 0);
10197 	VERIFY(ifp->if_flt_waiters == 0);
10198 	VERIFY(ifp->if_flt_non_os_count == 0);
10199 	VERIFY(ifp->if_flt_no_tso_count == 0);
10200 	lck_mtx_unlock(&ifp->if_flt_lock);
10201 
10202 	/* Last chance to drain send queue */
10203 	if_qflush_snd(ifp, 0);
10204 
10205 	/* Last chance to cleanup any cached route */
10206 	lck_mtx_lock(&ifp->if_cached_route_lock);
10207 	VERIFY(!ifp->if_fwd_cacheok);
10208 	ROUTE_RELEASE(&ifp->if_fwd_route);
10209 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
10210 	ROUTE_RELEASE(&ifp->if_src_route);
10211 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
10212 	ROUTE_RELEASE(&ifp->if_src_route6);
10213 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
10214 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10215 
10216 	VERIFY(ifp->if_data_threshold == 0);
10217 	VERIFY(ifp->if_dt_tcall != NULL);
10218 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10219 
10220 	ifnet_llreach_ifdetach(ifp);
10221 
10222 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
10223 
10224 	/*
10225 	 * Finally, mark this ifnet as detached.
10226 	 */
10227 	if (dlil_verbose) {
10228 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
10229 	}
10230 	lck_mtx_lock_spin(&ifp->if_ref_lock);
10231 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
10232 		panic("%s: flags mismatch (detaching not set) ifp=%p",
10233 		    __func__, ifp);
10234 		/* NOTREACHED */
10235 	}
10236 	ifp->if_refflags &= ~IFRF_DETACHING;
10237 	lck_mtx_unlock(&ifp->if_ref_lock);
10238 	if (if_free != NULL) {
10239 		if_free(ifp);
10240 	}
10241 
10242 	ifclassq_release(&ifp->if_snd);
10243 
10244 	/* we're fully detached, clear the "in use" bit */
10245 	dlifp = (struct dlil_ifnet *)ifp;
10246 	lck_mtx_lock(&dlifp->dl_if_lock);
10247 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10248 	dlifp->dl_if_flags &= ~DLIF_INUSE;
10249 	lck_mtx_unlock(&dlifp->dl_if_lock);
10250 
10251 	/* Release reference held during ifnet attach */
10252 	ifnet_release(ifp);
10253 }
10254 
/*
 * Stub if_output handler installed on an ifnet whose driver has been
 * detached (see detach finalization above, where the ifnet's function
 * pointers are redirected to these ifp_if_* stubs).  Drops the entire
 * outbound packet chain and reports success so callers don't retry.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);	/* m may be a chain of packets; free them all */
	return 0;
}
10262 
/*
 * Stub if_start handler for a detached ifnet: instead of transmitting,
 * purge whatever is still queued on the interface.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
10268 
/*
 * Stub if_input handler for a detached ifnet: free the inbound packet
 * chain and return ENXIO (device not configured) to the caller.
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
10278 
10279 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)10280 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
10281     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
10282 {
10283 #pragma unused(ifp, flags, max_cnt)
10284 	if (m_head != NULL) {
10285 		*m_head = NULL;
10286 	}
10287 	if (m_tail != NULL) {
10288 		*m_tail = NULL;
10289 	}
10290 	if (cnt != NULL) {
10291 		*cnt = 0;
10292 	}
10293 	if (len != NULL) {
10294 		*len = 0;
10295 	}
10296 }
10297 
/*
 * Stub output/input control handler for a detached ifnet; no control
 * commands are supported.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
10304 
/*
 * Stub demux handler for a detached ifnet: free the packet and return
 * EJUSTRETURN so the input path stops processing it (no protocol to
 * hand it to).
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
10312 
/*
 * Stub add_proto handler for a detached ifnet; attaching a protocol to
 * a dead interface is rejected.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
10320 
/*
 * Stub del_proto handler for a detached ifnet; there are no attached
 * protocols left to remove.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
10327 
/*
 * Stub multicast-address check handler for a detached ifnet; no
 * multicast membership is supported.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10334 
/*
 * Stub legacy framer for a detached ifnet.  The signature differs by
 * platform: the non-macOS build carries the extended pre/post header
 * length out-parameters, macOS does not.  Both variants simply forward
 * to ifp_if_framer_extended(), which frees the packet.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10353 
10354 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10355 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10356     const struct sockaddr *sa, const char *ll, const char *t,
10357     u_int32_t *pre, u_int32_t *post)
10358 {
10359 #pragma unused(ifp, sa, ll, t)
10360 	m_freem(*m);
10361 	*m = NULL;
10362 
10363 	if (pre != NULL) {
10364 		*pre = 0;
10365 	}
10366 	if (post != NULL) {
10367 		*post = 0;
10368 	}
10369 
10370 	return EJUSTRETURN;
10371 }
10372 
/*
 * Stub ioctl handler for a detached ifnet; no commands are supported.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10379 
/*
 * Stub BPF tap handler for a detached ifnet; silently accepts the
 * request without installing anything.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10387 
/*
 * Stub if_free handler for a detached ifnet; the original driver's
 * free routine (if any) has already been invoked by this point, so
 * there is nothing left to release.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10393 
/*
 * Stub event handler for a detached ifnet; kernel events are ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10399 
/*
 * Acquire a dlil_ifnet for a new interface of the given family.
 *
 * The entire dlil_ifnet_head list is scanned under dlil_if_lock:
 *   - EBUSY if an in-use interface already has the same extended name,
 *     or the same (non-empty) unique id.
 *   - If a not-in-use interface with a matching unique id exists, it is
 *     recycled (marked DLIF_INUSE | DLIF_REUSE) and returned.
 *   - Otherwise a fresh, zero-filled object is carved out of dlif_zone.
 *
 * On success *ifp points to the interface with a dlil reference held;
 * returns 0, EBUSY, or ENOMEM (unique-id buffer allocation failure).
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;	/* first recyclable match */
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		/* re-check under the per-object lock; flags may have changed */
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/description storage lives inside the dlil_ifnet itself */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* dlifp1 is NULL after a full list walk; non-NULL only on alloc/EBUSY */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10578 
/*
 * Common implementation for releasing a dlil_ifnet back to the cached
 * pool: drops the net_api allocation counters, frees any out-of-line
 * broadcast address storage, resets the name/xname pointers back to
 * the in-object storage (xname becomes "<name>?" until reassigned),
 * and optionally clears the DLIF_INUSE bit so the object can be
 * recycled by dlil_if_acquire().
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast addr longer than the inline buffer was heap-allocated */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10609 
/*
 * Public wrapper for _dlil_if_release() that keeps the DLIF_INUSE bit
 * set; used when the interface object is being released but not yet
 * made available for reuse.
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10615 
/* Acquire the global lock protecting dlil_ifnet_head. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10621 
/* Release the global lock protecting dlil_ifnet_head. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10627 
/* Assert that the current thread owns the global dlil_ifnet lock. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10633 
/*
 * Detach the well-known protocol families (PF_INET, PF_INET6) from the
 * interface during detach; see the comment below for why nothing else
 * needs an explicit unplumb here.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10649 
/*
 * Copy the interface's cached IPv4 source route into *dst while
 * holding if_cached_route_lock.  The lock is taken spin and converted
 * to a full mutex because route_copyout() may block.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10660 
/*
 * Store *src as the interface's cached IPv4 source route, consuming
 * the caller's route reference.  If route caching has been disabled on
 * this interface (if_fwd_cacheok cleared, e.g. during detach), the
 * route is released instead of cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10674 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * source route into *dst under if_cached_route_lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10686 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): cache *src (consuming
 * the caller's reference) unless route caching is disabled, in which
 * case the route is released.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10701 
/*
 * Look up (and cache) a scoped IPv4 route for src_ip on this interface.
 * Returns the cached route if it is still usable and matches src_ip;
 * otherwise performs a fresh scoped lookup, stores the result back into
 * the interface cache, and returns it.  The returned rtentry (possibly
 * NULL) carries a reference the caller must release.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	/* take a private copy of the cached route (with its reference) */
	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		/* scoped lookup restricted to this interface's index */
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10736 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): return the cached
 * scoped route for *src_ip6 if still usable, otherwise perform a fresh
 * scoped lookup and refresh the cache.  The returned rtentry (possibly
 * NULL) carries a reference the caller must release.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	/* take a private copy of the cached route (with its reference) */
	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL after the release above; look up fresh */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10773 
/*
 * Update the interface's link quality metric (LQM) state.  The raw lqm
 * value is first normalized to one of the threshold edges; values at or
 * below the ABORT threshold additionally trigger the fast inpcb timer so
 * TCP can abort connections promptly.  If the normalized value differs
 * from the current state, it is recorded and a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED kernel event is posted with the
 * ifnet lock dropped.  `locked` indicates whether the caller already
 * holds the ifnet lock exclusively; the lock is dropped around the
 * event post and reacquired for the caller in that case.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* schedule the fast timer so LQM-abort handling runs soon */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10837 
/*
 * Update the interface's cellular RRC state and post a
 * KEV_DL_RRC_STATE_CHANGED kernel event when it changes.
 *
 * Called with the ifnet lock held exclusively (see if_state_update);
 * the lock is dropped across the event post and reacquired before
 * returning to the caller.  No-op if the state is unchanged and
 * already marked valid.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10867 
/*
 * Apply externally-supplied interface state (LQM, RRC state, and
 * availability) to the ifnet, as driven by SIOCSIFINTERFACESTATE.
 *
 * Validation: RRC state may only be set on cellular interfaces
 * (ENOTSUP otherwise); LQM must be within [IFNET_LQM_MIN,
 * IFNET_LQM_MAX] and RRC state must be IDLE or CONNECTED (EINVAL
 * otherwise).  Each valid field is then applied under the exclusive
 * ifnet lock.  If the interface just became available, TCP is told to
 * send probes on it immediately rather than waiting for its timers.
 * Returns 0 on success.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/* lock is held; lqm/rrc helpers may drop and reacquire it */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10938 
10939 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10940 if_get_state(struct ifnet *ifp,
10941     struct if_interface_state *if_interface_state)
10942 {
10943 	ifnet_lock_shared(ifp);
10944 
10945 	if_interface_state->valid_bitmask = 0;
10946 
10947 	if (ifp->if_interface_state.valid_bitmask &
10948 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10949 		if_interface_state->valid_bitmask |=
10950 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10951 		if_interface_state->rrc_state =
10952 		    ifp->if_interface_state.rrc_state;
10953 	}
10954 	if (ifp->if_interface_state.valid_bitmask &
10955 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10956 		if_interface_state->valid_bitmask |=
10957 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10958 		if_interface_state->lqm_state =
10959 		    ifp->if_interface_state.lqm_state;
10960 	}
10961 	if (ifp->if_interface_state.valid_bitmask &
10962 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10963 		if_interface_state->valid_bitmask |=
10964 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10965 		if_interface_state->interface_availability =
10966 		    ifp->if_interface_state.interface_availability;
10967 	}
10968 
10969 	ifnet_lock_done(ifp);
10970 }
10971 
10972 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10973 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10974 {
10975 	if (conn_probe > 1) {
10976 		return EINVAL;
10977 	}
10978 	if (conn_probe == 0) {
10979 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10980 	} else {
10981 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10982 	}
10983 
10984 #if NECP
10985 	necp_update_all_clients();
10986 #endif /* NECP */
10987 
10988 	tcp_probe_connectivity(ifp, conn_probe);
10989 	return 0;
10990 }
10991 
10992 /* for uuid.c */
10993 static int
get_ether_index(int * ret_other_index)10994 get_ether_index(int * ret_other_index)
10995 {
10996 	struct ifnet *ifp;
10997 	int en0_index = 0;
10998 	int other_en_index = 0;
10999 	int any_ether_index = 0;
11000 	short best_unit = 0;
11001 
11002 	*ret_other_index = 0;
11003 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
11004 		/*
11005 		 * find en0, or if not en0, the lowest unit en*, and if not
11006 		 * that, any ethernet
11007 		 */
11008 		ifnet_lock_shared(ifp);
11009 		if (strcmp(ifp->if_name, "en") == 0) {
11010 			if (ifp->if_unit == 0) {
11011 				/* found en0, we're done */
11012 				en0_index = ifp->if_index;
11013 				ifnet_lock_done(ifp);
11014 				break;
11015 			}
11016 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
11017 				other_en_index = ifp->if_index;
11018 				best_unit = ifp->if_unit;
11019 			}
11020 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
11021 			any_ether_index = ifp->if_index;
11022 		}
11023 		ifnet_lock_done(ifp);
11024 	}
11025 	if (en0_index == 0) {
11026 		if (other_en_index != 0) {
11027 			*ret_other_index = other_en_index;
11028 		} else if (any_ether_index != 0) {
11029 			*ret_other_index = any_ether_index;
11030 		}
11031 	}
11032 	return en0_index;
11033 }
11034 
int
uuid_get_ethernet(u_int8_t *node)
{
	/*
	 * Copy an ethernet MAC address into `node' for UUID generation
	 * (see uuid.c).  Prefers en0, then another en* unit, then any
	 * ethernet interface.  Returns 0 on success (ETHER_ADDR_LEN
	 * bytes written), -1 if no suitable interface exists.
	 */
	static int en0_index;	/* cached across calls; revalidated below */
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* Revalidate the cached index; the interface may have detached */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			/* fall back to the current link-layer address */
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
11076 
11077 static int
11078 sysctl_rxpoll SYSCTL_HANDLER_ARGS
11079 {
11080 #pragma unused(arg1, arg2)
11081 	uint32_t i;
11082 	int err;
11083 
11084 	i = if_rxpoll;
11085 
11086 	err = sysctl_handle_int(oidp, &i, 0, req);
11087 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11088 		return err;
11089 	}
11090 
11091 	if (net_rxpoll == 0) {
11092 		return ENXIO;
11093 	}
11094 
11095 	if_rxpoll = i;
11096 	return err;
11097 }
11098 
11099 static int
11100 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11101 {
11102 #pragma unused(arg1, arg2)
11103 	uint64_t q;
11104 	int err;
11105 
11106 	q = if_rxpoll_mode_holdtime;
11107 
11108 	err = sysctl_handle_quad(oidp, &q, 0, req);
11109 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11110 		return err;
11111 	}
11112 
11113 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11114 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11115 	}
11116 
11117 	if_rxpoll_mode_holdtime = q;
11118 
11119 	return err;
11120 }
11121 
11122 static int
11123 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11124 {
11125 #pragma unused(arg1, arg2)
11126 	uint64_t q;
11127 	int err;
11128 
11129 	q = if_rxpoll_sample_holdtime;
11130 
11131 	err = sysctl_handle_quad(oidp, &q, 0, req);
11132 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11133 		return err;
11134 	}
11135 
11136 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11137 		q = IF_RXPOLL_SAMPLETIME_MIN;
11138 	}
11139 
11140 	if_rxpoll_sample_holdtime = q;
11141 
11142 	return err;
11143 }
11144 
11145 static int
11146 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11147 {
11148 #pragma unused(arg1, arg2)
11149 	uint64_t q;
11150 	int err;
11151 
11152 	q = if_rxpoll_interval_time;
11153 
11154 	err = sysctl_handle_quad(oidp, &q, 0, req);
11155 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11156 		return err;
11157 	}
11158 
11159 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11160 		q = IF_RXPOLL_INTERVALTIME_MIN;
11161 	}
11162 
11163 	if_rxpoll_interval_time = q;
11164 
11165 	return err;
11166 }
11167 
11168 static int
11169 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11170 {
11171 #pragma unused(arg1, arg2)
11172 	uint32_t i;
11173 	int err;
11174 
11175 	i = if_sysctl_rxpoll_wlowat;
11176 
11177 	err = sysctl_handle_int(oidp, &i, 0, req);
11178 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11179 		return err;
11180 	}
11181 
11182 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11183 		return EINVAL;
11184 	}
11185 
11186 	if_sysctl_rxpoll_wlowat = i;
11187 	return err;
11188 }
11189 
11190 static int
11191 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11192 {
11193 #pragma unused(arg1, arg2)
11194 	uint32_t i;
11195 	int err;
11196 
11197 	i = if_sysctl_rxpoll_whiwat;
11198 
11199 	err = sysctl_handle_int(oidp, &i, 0, req);
11200 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11201 		return err;
11202 	}
11203 
11204 	if (i <= if_sysctl_rxpoll_wlowat) {
11205 		return EINVAL;
11206 	}
11207 
11208 	if_sysctl_rxpoll_whiwat = i;
11209 	return err;
11210 }
11211 
11212 static int
11213 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11214 {
11215 #pragma unused(arg1, arg2)
11216 	int i, err;
11217 
11218 	i = if_sndq_maxlen;
11219 
11220 	err = sysctl_handle_int(oidp, &i, 0, req);
11221 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11222 		return err;
11223 	}
11224 
11225 	if (i < IF_SNDQ_MINLEN) {
11226 		i = IF_SNDQ_MINLEN;
11227 	}
11228 
11229 	if_sndq_maxlen = i;
11230 	return err;
11231 }
11232 
11233 static int
11234 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11235 {
11236 #pragma unused(arg1, arg2)
11237 	int i, err;
11238 
11239 	i = if_rcvq_maxlen;
11240 
11241 	err = sysctl_handle_int(oidp, &i, 0, req);
11242 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11243 		return err;
11244 	}
11245 
11246 	if (i < IF_RCVQ_MINLEN) {
11247 		i = IF_RCVQ_MINLEN;
11248 	}
11249 
11250 	if_rcvq_maxlen = i;
11251 	return err;
11252 }
11253 
11254 static int
11255 sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11256 {
11257 #pragma unused(arg1, arg2)
11258 	int i, err;
11259 
11260 	i = if_rcvq_burst_limit;
11261 
11262 	err = sysctl_handle_int(oidp, &i, 0, req);
11263 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11264 		return err;
11265 	}
11266 
11267 /*
11268  * Safeguard the burst limit to "sane" values on customer builds.
11269  */
11270 #if !(DEVELOPMENT || DEBUG)
11271 	if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11272 		i = IF_RCVQ_BURST_LIMIT_MIN;
11273 	}
11274 
11275 	if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11276 		i = IF_RCVQ_BURST_LIMIT_MAX;
11277 	}
11278 #endif
11279 
11280 	if_rcvq_burst_limit = i;
11281 	return err;
11282 }
11283 
11284 static int
11285 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11286 {
11287 #pragma unused(arg1, arg2)
11288 	int i, err;
11289 
11290 	i = if_rcvq_burst_limit;
11291 
11292 	err = sysctl_handle_int(oidp, &i, 0, req);
11293 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11294 		return err;
11295 	}
11296 
11297 	if (IF_RCVQ_TRIM_PCT_MAX < i) {
11298 		i = IF_RCVQ_TRIM_PCT_MAX;
11299 	}
11300 
11301 	if (i < IF_RCVQ_TRIM_PCT_MIN) {
11302 		i = IF_RCVQ_TRIM_PCT_MIN;
11303 	}
11304 
11305 	if_rcvq_trim_pct = i;
11306 	return err;
11307 }
11308 
11309 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11310 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11311     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11312 {
11313 	struct kev_dl_node_presence kev;
11314 	struct sockaddr_dl *sdl;
11315 	struct sockaddr_in6 *sin6;
11316 	int ret = 0;
11317 
11318 	VERIFY(ifp);
11319 	VERIFY(sa);
11320 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11321 
11322 	bzero(&kev, sizeof(kev));
11323 	sin6 = &kev.sin6_node_address;
11324 	sdl = &kev.sdl_node_address;
11325 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11326 	kev.rssi = rssi;
11327 	kev.link_quality_metric = lqm;
11328 	kev.node_proximity_metric = npm;
11329 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11330 
11331 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11332 	if (ret == 0 || ret == EEXIST) {
11333 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11334 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11335 		if (err != 0) {
11336 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11337 			    "error %d\n", __func__, err);
11338 		}
11339 	}
11340 
11341 	if (ret == EEXIST) {
11342 		ret = 0;
11343 	}
11344 	return ret;
11345 }
11346 
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	/*
	 * Report that a neighbor node has disappeared.  Accepts either
	 * an AF_INET6 address (link-layer address looked up from the
	 * neighbor cache) or an AF_LINK address (IPv6 address derived
	 * from it), removes the ND entry, and posts KEV_DL_NODE_ABSENCE
	 * on success.
	 */
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp the event's link address with this interface */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11387 
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	/*
	 * Variant of dlil_node_present() where the caller supplies the
	 * IPv6 address (sa) and the link-layer address (sdl) separately
	 * instead of a single address to decompose.  Records the node
	 * in the ND cache and posts KEV_DL_NODE_PRESENCE on success or
	 * refresh (EEXIST).  Returns 0 on success (EEXIST is folded
	 * into success), or an errno from nd6_alt_node_present().
	 */
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* Copy the link-layer address, stamped with this interface */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	/* EEXIST means the node was already known; still post the event */
	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11431 
11432 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11433 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11434     kauth_cred_t *credp)
11435 {
11436 	const u_int8_t *bytes;
11437 	size_t size;
11438 
11439 	bytes = CONST_LLADDR(sdl);
11440 	size = sdl->sdl_alen;
11441 
11442 #if CONFIG_MACF
11443 	if (dlil_lladdr_ckreq) {
11444 		switch (sdl->sdl_type) {
11445 		case IFT_ETHER:
11446 		case IFT_IEEE1394:
11447 			break;
11448 		default:
11449 			credp = NULL;
11450 			break;
11451 		}
11452 		;
11453 
11454 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11455 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11456 				[0] = 2
11457 			};
11458 
11459 			bytes = unspec;
11460 		}
11461 	}
11462 #else
11463 #pragma unused(credp)
11464 #endif
11465 
11466 	if (sizep != NULL) {
11467 		*sizep = size;
11468 	}
11469 	return bytes;
11470 }
11471 
11472 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11473 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11474     u_int8_t info[DLIL_MODARGLEN])
11475 {
11476 	struct kev_dl_issues kev;
11477 	struct timeval tv;
11478 
11479 	VERIFY(ifp != NULL);
11480 	VERIFY(modid != NULL);
11481 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11482 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11483 
11484 	bzero(&kev, sizeof(kev));
11485 
11486 	microtime(&tv);
11487 	kev.timestamp = tv.tv_sec;
11488 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11489 	if (info != NULL) {
11490 		bcopy(info, &kev.info, DLIL_MODARGLEN);
11491 	}
11492 
11493 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11494 	    &kev.link_data, sizeof(kev), FALSE);
11495 }
11496 
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	/*
	 * Handle SIOCSIFOPPORTUNISTIC (set) and SIOCGIFOPPORTUNISTIC
	 * (get) for the interface's opportunistic throttling level.
	 * On both paths, ifo_inuse is filled in with the number of
	 * opportunistic TCP+UDP connections on the interface.
	 */
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Only "fully blocked" or "off" are accepted */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* get: translate throttle level back into ifo_flags */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* "already at this level" is not an error to the caller */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11555 
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	/*
	 * Query the interface's current output throttling level via a
	 * CLASSQRQ_THROTTLE request to its send queue.  Requires an
	 * interface using the driver-managed start model (IFEF_TXSTART);
	 * returns ENXIO otherwise.  *level defaults to
	 * IFNET_THROTTLE_OFF when the classq is not enabled.
	 */
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 0 = query (don't change the level) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11581 
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	/*
	 * Set the interface's output throttling level via a
	 * CLASSQRQ_THROTTLE request to its send queue.  Only
	 * IFNET_THROTTLE_OFF and IFNET_THROTTLE_OPPORTUNISTIC are
	 * valid.  On success, NECP clients are notified and, when
	 * unthrottling, output is kick-started.
	 */
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 1 = set (change the level) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* resume draining packets that were held back */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11623 
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	/*
	 * Handle SIOCSIFLOG (set, privileged) and SIOCGIFLOG (get) for
	 * the interface's logging parameters: level, facility flags,
	 * category and subcategory.
	 */
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* at least one known facility flag must be set */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11671 
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	/*
	 * Apply logging parameters to the interface.  Facilities other
	 * than DLIL's own (IFNET_LOGF_DLIL) are forwarded to the driver
	 * via its if_output_ctl callback when one is registered;
	 * otherwise they are silently dropped.  Setting the level to
	 * IFNET_LOG_DEFAULT clears all facility flags.
	 */
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* default level resets the facility flags entirely */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11730 
11731 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11732 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11733     int32_t *category, int32_t *subcategory)
11734 {
11735 	if (level != NULL) {
11736 		*level = ifp->if_log.level;
11737 	}
11738 	if (flags != NULL) {
11739 		*flags = ifp->if_log.flags;
11740 	}
11741 	if (category != NULL) {
11742 		*category = ifp->if_log.category;
11743 	}
11744 	if (subcategory != NULL) {
11745 		*subcategory = ifp->if_log.subcategory;
11746 	}
11747 
11748 	return 0;
11749 }
11750 
11751 int
ifnet_notify_address(struct ifnet * ifp,int af)11752 ifnet_notify_address(struct ifnet *ifp, int af)
11753 {
11754 	struct ifnet_notify_address_params na;
11755 
11756 #if PF
11757 	(void) pf_ifaddr_hook(ifp);
11758 #endif /* PF */
11759 
11760 	if (ifp->if_output_ctl == NULL) {
11761 		return EOPNOTSUPP;
11762 	}
11763 
11764 	bzero(&na, sizeof(na));
11765 	na.address_family = (sa_family_t)af;
11766 
11767 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11768 	           sizeof(na), &na);
11769 }
11770 
11771 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11772 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11773 {
11774 	if (ifp == NULL || flowid == NULL) {
11775 		return EINVAL;
11776 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11777 	    !IF_FULLY_ATTACHED(ifp)) {
11778 		return ENXIO;
11779 	}
11780 
11781 	*flowid = ifp->if_flowhash;
11782 
11783 	return 0;
11784 }
11785 
11786 errno_t
ifnet_disable_output(struct ifnet * ifp)11787 ifnet_disable_output(struct ifnet *ifp)
11788 {
11789 	int err;
11790 
11791 	if (ifp == NULL) {
11792 		return EINVAL;
11793 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11794 	    !IF_FULLY_ATTACHED(ifp)) {
11795 		return ENXIO;
11796 	}
11797 
11798 	if ((err = ifnet_fc_add(ifp)) == 0) {
11799 		lck_mtx_lock_spin(&ifp->if_start_lock);
11800 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11801 		lck_mtx_unlock(&ifp->if_start_lock);
11802 	}
11803 	return err;
11804 }
11805 
11806 errno_t
ifnet_enable_output(struct ifnet * ifp)11807 ifnet_enable_output(struct ifnet *ifp)
11808 {
11809 	if (ifp == NULL) {
11810 		return EINVAL;
11811 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11812 	    !IF_FULLY_ATTACHED(ifp)) {
11813 		return ENXIO;
11814 	}
11815 
11816 	ifnet_start_common(ifp, TRUE, FALSE);
11817 	return 0;
11818 }
11819 
void
ifnet_flowadv(uint32_t flowhash)
{
	/*
	 * Flow advisory: a previously flow-controlled interface (see
	 * ifnet_disable_output) identified by its flow hash may resume
	 * output.  Looks up and removes the flow-control entry, then
	 * re-enables output on the matching interface.
	 */
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the io refcnt taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11843 
11844 /*
11845  * Function to compare ifnet_fc_entries in ifnet flow control tree
11846  */
11847 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11848 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11849 {
11850 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11851 }
11852 
static int
ifnet_fc_add(struct ifnet *ifp)
{
	/*
	 * Register the interface in the flow-control tree, keyed by its
	 * flow hash.  Returns 0 if inserted or already present for this
	 * ifp, EAGAIN on a hash collision with a different ifp.
	 */
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; zalloc below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11896 
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	/*
	 * Look up and REMOVE the flow-control entry for the given flow
	 * hash.  Returns the entry (caller must free it with
	 * ifnet_fc_entry_free), or NULL if no entry exists or its
	 * interface is no longer attached.
	 */
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11934 
/*
 * Return a flow-control entry to its zone.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11940 
11941 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11942 ifnet_calc_flowhash(struct ifnet *ifp)
11943 {
11944 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11945 	uint32_t flowhash = 0;
11946 
11947 	if (ifnet_flowhash_seed == 0) {
11948 		ifnet_flowhash_seed = RandomULong();
11949 	}
11950 
11951 	bzero(&fh, sizeof(fh));
11952 
11953 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11954 	fh.ifk_unit = ifp->if_unit;
11955 	fh.ifk_flags = ifp->if_flags;
11956 	fh.ifk_eflags = ifp->if_eflags;
11957 	fh.ifk_capabilities = ifp->if_capabilities;
11958 	fh.ifk_capenable = ifp->if_capenable;
11959 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11960 	fh.ifk_rand1 = RandomULong();
11961 	fh.ifk_rand2 = RandomULong();
11962 
11963 try_again:
11964 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11965 	if (flowhash == 0) {
11966 		/* try to get a non-zero flowhash */
11967 		ifnet_flowhash_seed = RandomULong();
11968 		goto try_again;
11969 	}
11970 
11971 	return flowhash;
11972 }
11973 
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	/*
	 * Store (or, with len == 0, clear) the network signature for
	 * the given address family on the interface.  Returns EINVAL
	 * for an unsupported family or oversized signature, ENOMEM when
	 * the per-family extra data has not been allocated.
	 *
	 * Note: paths that `break' out of the inner if/else have
	 * already dropped the per-family lock; the fall-through path
	 * drops it after the if/else.
	 */
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
12035 
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	/*
	 * Copy out the network signature for the given address family.
	 * On input, *len is the caller's buffer size; on success it is
	 * updated to the signature length.  Returns EINVAL if the
	 * buffer is too small (or family unsupported), ENOENT when no
	 * signature is set, ENOMEM when per-family extra data has not
	 * been allocated.  *flags is cleared on success when supplied.
	 */
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12096 
12097 int
ifnet_set_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12098 ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12099 {
12100 	int i, error = 0, one_set = 0;
12101 
12102 	if_inet6data_lock_exclusive(ifp);
12103 
12104 	if (IN6_IFEXTRA(ifp) == NULL) {
12105 		error = ENOMEM;
12106 		goto out;
12107 	}
12108 
12109 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12110 		uint32_t prefix_len =
12111 		    prefixes[i].prefix_len;
12112 		struct in6_addr *prefix =
12113 		    &prefixes[i].ipv6_prefix;
12114 
12115 		if (prefix_len == 0) {
12116 			clat_log0((LOG_DEBUG,
12117 			    "NAT64 prefixes purged from Interface %s\n",
12118 			    if_name(ifp)));
12119 			/* Allow clearing the signature */
12120 			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
12121 			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12122 			    sizeof(struct in6_addr));
12123 
12124 			continue;
12125 		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
12126 		    prefix_len != NAT64_PREFIX_LEN_40 &&
12127 		    prefix_len != NAT64_PREFIX_LEN_48 &&
12128 		    prefix_len != NAT64_PREFIX_LEN_56 &&
12129 		    prefix_len != NAT64_PREFIX_LEN_64 &&
12130 		    prefix_len != NAT64_PREFIX_LEN_96) {
12131 			clat_log0((LOG_DEBUG,
12132 			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
12133 			error = EINVAL;
12134 			goto out;
12135 		}
12136 
12137 		if (IN6_IS_SCOPE_EMBED(prefix)) {
12138 			clat_log0((LOG_DEBUG,
12139 			    "NAT64 prefix has interface/link local scope.\n"));
12140 			error = EINVAL;
12141 			goto out;
12142 		}
12143 
12144 		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
12145 		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12146 		    sizeof(struct in6_addr));
12147 		clat_log0((LOG_DEBUG,
12148 		    "NAT64 prefix set to %s with prefixlen: %d\n",
12149 		    ip6_sprintf(prefix), prefix_len));
12150 		one_set = 1;
12151 	}
12152 
12153 out:
12154 	if_inet6data_lock_done(ifp);
12155 
12156 	if (error == 0 && one_set != 0) {
12157 		necp_update_all_clients();
12158 	}
12159 
12160 	return error;
12161 }
12162 
12163 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12164 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12165 {
12166 	int i, found_one = 0, error = 0;
12167 
12168 	if (ifp == NULL) {
12169 		return EINVAL;
12170 	}
12171 
12172 	if_inet6data_lock_shared(ifp);
12173 
12174 	if (IN6_IFEXTRA(ifp) == NULL) {
12175 		error = ENOMEM;
12176 		goto out;
12177 	}
12178 
12179 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12180 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12181 			found_one = 1;
12182 		}
12183 	}
12184 
12185 	if (found_one == 0) {
12186 		error = ENOENT;
12187 		goto out;
12188 	}
12189 
12190 	if (prefixes) {
12191 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12192 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12193 	}
12194 
12195 out:
12196 	if_inet6data_lock_done(ifp);
12197 
12198 	return error;
12199 }
12200 
12201 __attribute__((noinline))
12202 static void
dlil_output_cksum_dbg(struct ifnet * ifp,struct mbuf * m,uint32_t hoff,protocol_family_t pf)12203 dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
12204     protocol_family_t pf)
12205 {
12206 #pragma unused(ifp)
12207 	uint32_t did_sw;
12208 
12209 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
12210 	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
12211 		return;
12212 	}
12213 
12214 	switch (pf) {
12215 	case PF_INET:
12216 		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
12217 		if (did_sw & CSUM_DELAY_IP) {
12218 			hwcksum_dbg_finalized_hdr++;
12219 		}
12220 		if (did_sw & CSUM_DELAY_DATA) {
12221 			hwcksum_dbg_finalized_data++;
12222 		}
12223 		break;
12224 	case PF_INET6:
12225 		/*
12226 		 * Checksum offload should not have been enabled when
12227 		 * extension headers exist; that also means that we
12228 		 * cannot force-finalize packets with extension headers.
12229 		 * Indicate to the callee should it skip such case by
12230 		 * setting optlen to -1.
12231 		 */
12232 		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
12233 		    m->m_pkthdr.csum_flags);
12234 		if (did_sw & CSUM_DELAY_IPV6_DATA) {
12235 			hwcksum_dbg_finalized_data++;
12236 		}
12237 		break;
12238 	default:
12239 		return;
12240 	}
12241 }
12242 
/*
 * Debug-mode receive checksum processing, controlled by the
 * hwcksum_dbg_mode sysctl flags; applies only to PF_INET/PF_INET6.
 *
 *  - HWCKSUM_DBG_PARTIAL_FORCED: discard any hardware-provided rx
 *    checksum result and compute a partial 16-bit 1's complement sum
 *    in software starting at a forced offset, to simulate hardware
 *    without partial checksum offload support.
 *
 *  - For packets marked (CSUM_DATA_VALID|CSUM_PARTIAL) without
 *    CSUM_PSEUDO_HDR: recompute the partial sum and verify it against
 *    the driver-reported value, counting mismatches; optionally
 *    (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) re-adjust the sum to a different
 *    start offset to emulate hardware that sums from elsewhere.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* The frame header must lie within this mbuf's data area */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		/* Forced offset must lie within the packet */
		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Drop whatever rx checksum state the driver provided */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Make the offset relative to the start of payload data */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			/* Nothing to adjust, or target offset out of range */
			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12367 
12368 static int
12369 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12370 {
12371 #pragma unused(arg1, arg2)
12372 	u_int32_t i;
12373 	int err;
12374 
12375 	i = hwcksum_dbg_mode;
12376 
12377 	err = sysctl_handle_int(oidp, &i, 0, req);
12378 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12379 		return err;
12380 	}
12381 
12382 	if (hwcksum_dbg == 0) {
12383 		return ENODEV;
12384 	}
12385 
12386 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12387 		return EINVAL;
12388 	}
12389 
12390 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12391 
12392 	return err;
12393 }
12394 
12395 static int
12396 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12397 {
12398 #pragma unused(arg1, arg2)
12399 	u_int32_t i;
12400 	int err;
12401 
12402 	i = hwcksum_dbg_partial_rxoff_forced;
12403 
12404 	err = sysctl_handle_int(oidp, &i, 0, req);
12405 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12406 		return err;
12407 	}
12408 
12409 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12410 		return ENODEV;
12411 	}
12412 
12413 	hwcksum_dbg_partial_rxoff_forced = i;
12414 
12415 	return err;
12416 }
12417 
12418 static int
12419 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12420 {
12421 #pragma unused(arg1, arg2)
12422 	u_int32_t i;
12423 	int err;
12424 
12425 	i = hwcksum_dbg_partial_rxoff_adj;
12426 
12427 	err = sysctl_handle_int(oidp, &i, 0, req);
12428 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12429 		return err;
12430 	}
12431 
12432 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12433 		return ENODEV;
12434 	}
12435 
12436 	hwcksum_dbg_partial_rxoff_adj = i;
12437 
12438 	return err;
12439 }
12440 
12441 static int
12442 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12443 {
12444 #pragma unused(oidp, arg1, arg2)
12445 	int err;
12446 
12447 	if (req->oldptr == USER_ADDR_NULL) {
12448 	}
12449 	if (req->newptr != USER_ADDR_NULL) {
12450 		return EPERM;
12451 	}
12452 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12453 	    sizeof(struct chain_len_stats));
12454 
12455 	return err;
12456 }
12457 
#if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification.  The leading bytes (0x1f 0x8b 0x08)
 * resemble a gzip header, but the content is used purely as arbitrary
 * test data for the checksum self-tests in dlil_verify_sum16().
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/*
 * Precomputed 16-bit 1's complement sums for various spans of the above
 * data.  Each entry covers sumdata[0..len); sumr is filled in lazily by
 * dlil_verify_sum16() from the reference implementation and compared
 * against the hard-coded sumrp value.
 */
static struct {
	boolean_t       init;   /* TRUE once sumr has been computed */
	uint16_t        len;    /* number of bytes summed */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12520 
/*
 * SUM16 self-tests: for each sumtbl entry and each of the 8 possible
 * byte alignments, compare m_sum16() (and, on INET, b_sum16()) against
 * the reference in_cksum_mbuf_ref() and the precomputed value; panic
 * on any mismatch.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Lazily compute the reference sum on first visit */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by alignment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
#endif /* DEBUG || DEVELOPMENT */
12610 
/* Expands to a case label returning the stringified constant name */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel-event code to its symbolic name, for logging
 * and debugging.  Returns the empty string for codes not listed here.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12647 
12648 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12649 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12650 {
12651 #pragma unused(arg1)
12652 	struct ifnet *ifp = arg0;
12653 
12654 	if (ifnet_is_attached(ifp, 1)) {
12655 		nstat_ifnet_threshold_reached(ifp->if_index);
12656 		ifnet_decr_iorefcnt(ifp);
12657 	}
12658 }
12659 
/*
 * Check whether the interface's combined rx+tx byte count has moved
 * past if_data_threshold since the last notification, and if so
 * schedule the data-threshold thread call (dlil_dt_tcall_fn) to
 * notify NetworkStatistics.
 *
 * The OSCompareAndSwap64 on if_dt_bytes both records the byte mark at
 * which the notification fired and ensures that, among concurrent
 * callers, only the CAS winner proceeds to arm the thread call.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Align the deadline to the periodic interval */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: fire immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12689 
#if (DEVELOPMENT || DEBUG)
/*
 * The sysctl variable name contains the input parameters of
 * ifnet_get_keepalive_offload_frames()
 *  ifp (interface index): name[0]
 *  frames_array_count:    name[1]
 *  frame_data_offset:     name[2]
 * The return length gives used_frames_count
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root may look at other people's TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		/* this sysctl is read-only */
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* Translate the interface index into an ifnet pointer */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	/* Temporary kernel copy of the frames, filled in by the driver */
	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy only the frames actually filled in out to userspace */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
#endif /* DEVELOPMENT || DEBUG */
12791 
/*
 * Forward a per-flow stats snapshot to the TCP layer; thin wrapper
 * around tcp_update_stats_per_flow().
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12798 
/*
 * Atomically OR set_flags into *flags_p; returns the value the flags
 * word held before the update.
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12804 
/* Atomically clear clear_flags in *flags_p (AND with the complement). */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12810 
/*
 * Atomically set bits in the interface's extended flags (if_eflags);
 * returns the prior flags value (see _set_flags()).
 */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12816 
/* Atomically clear bits in the interface's extended flags (if_eflags). */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12822 
/*
 * Atomically set bits in the interface's extra flags (if_xflags);
 * returns the prior flags value (see _set_flags()).
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12828 
/* Atomically clear bits in the interface's extra flags (if_xflags). */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12834 
/*
 * Bump the traffic-rule generation counter so that consumers using
 * ifnet_sync_traffic_rule_genid() notice a change.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12840 
12841 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12842 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12843 {
12844 	if (*genid != ifp->if_traffic_rule_genid) {
12845 		*genid = ifp->if_traffic_rule_genid;
12846 		return TRUE;
12847 	}
12848 	return FALSE;
12849 }
/*
 * Publish a new traffic-rule count (release ordering) and bump the
 * generation id so consumers notice the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12856 
12857 static void
log_hexdump(void * data,size_t len)12858 log_hexdump(void *data, size_t len)
12859 {
12860 	size_t i, j, k;
12861 	unsigned char *ptr = (unsigned char *)data;
12862 #define MAX_DUMP_BUF 32
12863 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12864 
12865 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12866 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12867 			unsigned char msnbl = ptr[j] >> 4;
12868 			unsigned char lsnbl = ptr[j] & 0x0f;
12869 
12870 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12871 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12872 
12873 			if ((j % 2) == 1) {
12874 				buf[k++] = ' ';
12875 			}
12876 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12877 				buf[k++] = ' ';
12878 			}
12879 		}
12880 		buf[k] = 0;
12881 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12882 	}
12883 }
12884 
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Report whether interface filters are compatible: with a NULL ifp,
 * check the global counters (no non-OS filters attached anywhere);
 * otherwise check the given interface's non-OS filter count.
 */
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp == NULL) {
		return net_api_stats.nas_iflt_attach_count <=
		       net_api_stats.nas_iflt_attach_os_count;
	}
	return ifp->if_flt_non_os_count == 0;
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
12901 
/* Advance the dump cursor by k bytes; bail to done when the buffer fills. */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}

int dlil_dump_top_if_qlen(char *, int);
/*
 * Write a short report of the interfaces with the deepest send queue
 * and input queue into str (at most str_len bytes); returns the number
 * of bytes written.
 *
 * NOTE(review): the scan uses "ifidx < if_index", which skips the
 * interface at index if_index itself -- confirm against
 * IF_INDEX_IN_RANGE(), which elsewhere treats if_index as a valid
 * index.  The walk also reads ifindex2ifnet without the ifnet head
 * lock; presumably acceptable for a debug dump, but verify.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		/* Track the deepest interface send queue */
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		/* Track the deepest DLIL input queue */
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12950